//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
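//
// For example, with a vectorization factor (VF) of 4 the transformation is,
// conceptually (shown here as C-like pseudo-code rather than the LLVM-IR the
// pass actually emits):
//
//   for (i = 0; i < n; ++i)           for (i = 0; i < n; i += 4)
//     A[i] = B[i] + C[i];       =>      A[i..i+3] = B[i..i+3] + C[i..i+3];
//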
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
156 using namespace llvm;
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172 /// @}
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; this lists all options. I.e., the
// vectorizer will try to fold the tail loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));

cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
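  // For example, on targets where x86_fp80 has a type size of 80 bits but an
  // alloc size of 96 or 128 bits, [N x x86_fp80] is padded and is therefore
  // not bitcast compatible with <N x x86_fp80>.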
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
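/// For example, under that assumption the cost of a block that executes under
/// an if inside the loop is divided by 2 when it is added to the total loop
/// cost.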
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }
467 /// Create a new empty loop that will contain vectorized instructions later
468 /// on, while the old loop will be used as the scalar remainder. Control flow
469 /// is generated around the vectorized (and scalar epilogue) loops consisting
470 /// of various checks and bypasses. Return the pre-header block of the new
471 /// loop.
472 /// In the case of epilogue vectorization, this function is overriden to
473 /// handle the more complex control flow around the loops.
474 virtual BasicBlock *createVectorizedLoopSkeleton();
476 /// Widen a single instruction within the innermost loop.
477 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
478 VPTransformState &State);
480 /// Widen a single call instruction within the innermost loop.
481 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
482 VPTransformState &State);
484 /// Widen a single select instruction within the innermost loop.
485 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
486 bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None, the class member's Builder is used instead.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
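  /// For example, with StartIdx == 0, Step == 1 and a 4-element Val, the
  /// vector <0, 1, 2, 3> is added element-wise to Val.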
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
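  /// For example, with TripCount == 103, VF == 8 and UF == 2 this is 96; the
  /// vector loop covers 96 of the original iterations and the remaining 7 run
  /// in the scalar epilogue.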
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
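/// For example, Step == 2 with VF == 4 yields the constant 8, while
/// VF == <vscale x 4> yields 8 * vscale.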
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
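/// The ordering compares (isScalable, known minimum value), so all fixed VFs
/// (e.g. 4, 8, 16) order before all scalable VFs (e.g. vscale x 4).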
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1280 /// Memory access instruction may be vectorized in more than one way.
1281 /// Form of instruction after vectorization depends on cost.
1282 /// This function takes cost-based decisions for Load/Store instructions
1283 /// and collects them in a map. This decisions map is used for building
1284 /// the lists of loop-uniform and loop-scalar instructions.
1285 /// The calculated cost is saved with widening decision in order to
1286 /// avoid redundant calculations.
1287 void setCostBasedWideningDecision(ElementCount VF);
1289 /// A struct that represents some properties of the register usage
1290 /// of a loop.
1291 struct RegisterUsage {
1292 /// Holds the number of loop invariant values that are used in the loop.
1293 /// The key is ClassID of target-provided register class.
1294 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1295 /// Holds the maximum number of concurrent live intervals in the loop.
1296 /// The key is ClassID of target-provided register class.
1297 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1300 /// \return Information about the register usage of the loop for the
1301 /// given vectorization factors.
1302 SmallVector<RegisterUsage, 8>
1303 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1305 /// Collect values we want to ignore in the cost model.
1306 void collectValuesToIgnore();
1308 /// Collect all element types in the loop for which widening is needed.
1309 void collectElementTypesForWidening();
1311 /// Split reductions into those that happen in the loop, and those that happen
1312 /// outside. In-loop reductions are collected into InLoopReductionChains.
1313 void collectInLoopReductions();
1315 /// Returns true if we should use strict in-order reductions for the given
1316 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1317 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1318 /// of FP operations.
1319 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1320 return ForceOrderedReductions && !Hints->allowReordering() &&
1321 RdxDesc.isOrdered();
1324 /// \returns The smallest bitwidth each instruction can be represented with.
1325 /// The vector equivalents of these instructions should be truncated to this
1326 /// type.
1327 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1328 return MinBWs;
1331 /// \returns True if it is more profitable to scalarize instruction \p I for
1332 /// vectorization factor \p VF.
1333 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1334 assert(VF.isVector() &&
1335 "Profitable to scalarize relevant only for VF > 1.");
1337 // Cost model is not run in the VPlan-native path - return conservative
1338 // result until this changes.
1339 if (EnableVPlanNativePath)
1340 return false;
1342 auto Scalars = InstsToScalarize.find(VF);
1343 assert(Scalars != InstsToScalarize.end() &&
1344 "VF not yet analyzed for scalarization profitability");
1345 return Scalars->second.find(I) != Scalars->second.end();
1348 /// Returns true if \p I is known to be uniform after vectorization.
1349 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1350 if (VF.isScalar())
1351 return true;
1353 // Cost model is not run in the VPlan-native path - return conservative
1354 // result until this changes.
1355 if (EnableVPlanNativePath)
1356 return false;
1358 auto UniformsPerVF = Uniforms.find(VF);
1359 assert(UniformsPerVF != Uniforms.end() &&
1360 "VF not yet analyzed for uniformity");
1361 return UniformsPerVF->second.count(I);
1364 /// Returns true if \p I is known to be scalar after vectorization.
1365 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1366 if (VF.isScalar())
1367 return true;
1369 // Cost model is not run in the VPlan-native path - return conservative
1370 // result until this changes.
1371 if (EnableVPlanNativePath)
1372 return false;
1374 auto ScalarsPerVF = Scalars.find(VF);
1375 assert(ScalarsPerVF != Scalars.end() &&
1376 "Scalar values are not calculated for VF");
1377 return ScalarsPerVF->second.count(I);
1380 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1381 /// for vectorization factor \p VF.
1382 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1383 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1384 !isProfitableToScalarize(I, VF) &&
1385 !isScalarAfterVectorization(I, VF);
1388 /// Decision that was taken during cost calculation for memory instruction.
1389 enum InstWidening {
1390 CM_Unknown,
1391 CM_Widen, // For consecutive accesses with stride +1.
1392 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1393 CM_Interleave,
1394 CM_GatherScatter,
1395 CM_Scalarize
1398 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1399 /// instruction \p I and vector width \p VF.
1400 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1401 InstructionCost Cost) {
1402 assert(VF.isVector() && "Expected VF >=2");
1403 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1406 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1407 /// interleaving group \p Grp and vector width \p VF.
1408 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1409 ElementCount VF, InstWidening W,
1410 InstructionCost Cost) {
1411 assert(VF.isVector() && "Expected VF >=2");
1412 /// Broadcast this decision to all instructions inside the group.
1413 /// But the cost will be assigned to one instruction only.
1414 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1415 if (auto *I = Grp->getMember(i)) {
1416 if (Grp->getInsertPos() == I)
1417 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1418 else
1419 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1424 /// Return the cost model decision for the given instruction \p I and vector
1425 /// width \p VF. Return CM_Unknown if this instruction did not pass
1426 /// through the cost modeling.
1427 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1428 assert(VF.isVector() && "Expected VF to be a vector VF");
1429 // Cost model is not run in the VPlan-native path - return conservative
1430 // result until this changes.
1431 if (EnableVPlanNativePath)
1432 return CM_GatherScatter;
1434 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1435 auto Itr = WideningDecisions.find(InstOnVF);
1436 if (Itr == WideningDecisions.end())
1437 return CM_Unknown;
1438 return Itr->second.first;
1441 /// Return the vectorization cost for the given instruction \p I and vector
1442 /// width \p VF.
1443 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1444 assert(VF.isVector() && "Expected VF >=2");
1445 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1446 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1447 "The cost is not calculated");
1448 return WideningDecisions[InstOnVF].second;
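  // Illustrative example (not actual pass code): for a unit-stride load L and
  // VF = 4, the cost model would typically record
  //   setWideningDecision(L, ElementCount::getFixed(4), CM_Widen, Cost);
  // after which getWideningDecision(L, VF) returns CM_Widen and
  // getWideningCost(L, VF) returns the recorded Cost.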
1451 /// Return True if instruction \p I is an optimizable truncate whose operand
1452 /// is an induction variable. Such a truncate will be removed by adding a new
1453 /// induction variable with the destination type.
1454 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1455 // If the instruction is not a truncate, return false.
1456 auto *Trunc = dyn_cast<TruncInst>(I);
1457 if (!Trunc)
1458 return false;
1460 // Get the source and destination types of the truncate.
1461 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1462 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1464 // If the truncate is free for the given types, return false. Replacing a
1465 // free truncate with an induction variable would add an induction variable
1466 // update instruction to each iteration of the loop. We exclude from this
1467 // check the primary induction variable since it will need an update
1468 // instruction regardless.
1469 Value *Op = Trunc->getOperand(0);
1470 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1471 return false;
1473 // If the truncated value is not an induction variable, return false.
1474 return Legal->isInductionPhi(Op);
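  // Illustrative example: in a loop such as
  //   for (i64 i = 0; i < n; ++i) A[i] = (i32)i;
  // the 'trunc i64 %i to i32' feeding the store is an optimizable IV truncate:
  // instead of truncating the wide IV on every iteration, a new i32 induction
  // variable with the destination type can be created.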
1477 /// Collects the instructions to scalarize for each predicated instruction in
1478 /// the loop.
1479 void collectInstsToScalarize(ElementCount VF);
1481 /// Collect Uniform and Scalar values for the given \p VF.
1482 /// The sets depend on CM decision for Load/Store instructions
1483 /// that may be vectorized as interleave, gather-scatter or scalarized.
1484 void collectUniformsAndScalars(ElementCount VF) {
1485 // Do the analysis once.
1486 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1487 return;
1488 setCostBasedWideningDecision(VF);
1489 collectLoopUniforms(VF);
1490 collectLoopScalars(VF);
1493 /// Returns true if the target machine supports masked store operation
1494 /// for the given \p DataType and kind of access to \p Ptr.
1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496 return Legal->isConsecutivePtr(Ptr) &&
1497 TTI.isLegalMaskedStore(DataType, Alignment);
1500 /// Returns true if the target machine supports masked load operation
1501 /// for the given \p DataType and kind of access to \p Ptr.
1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1503 return Legal->isConsecutivePtr(Ptr) &&
1504 TTI.isLegalMaskedLoad(DataType, Alignment);
1507 /// Returns true if the target machine can represent \p V as a masked gather
1508 /// or scatter operation.
1509 bool isLegalGatherOrScatter(Value *V) {
1510 bool LI = isa<LoadInst>(V);
1511 bool SI = isa<StoreInst>(V);
1512 if (!LI && !SI)
1513 return false;
1514 auto *Ty = getLoadStoreType(V);
1515 Align Align = getLoadStoreAlignment(V);
1516 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1517 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520 /// Returns true if the target machine supports all of the reduction
1521 /// variables found for the given VF.
1522 bool canVectorizeReductions(ElementCount VF) const {
1523 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1524 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1525 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1526 }));
1529 /// Returns true if \p I is an instruction that will be scalarized with
1530 /// predication. Such instructions include conditional stores and
1531 /// instructions that may divide by zero.
1532 /// If a non-zero VF has been calculated, we check if I will be scalarized
1533 /// with predication for that VF.
1534 bool isScalarWithPredication(Instruction *I) const;
1536 // Returns true if \p I is an instruction that will be predicated either
1537 // through scalar predication or masked load/store or masked gather/scatter.
1538 // Superset of instructions that return true for isScalarWithPredication.
1539 bool isPredicatedInst(Instruction *I) {
1540 if (!blockNeedsPredication(I->getParent()))
1541 return false;
1542 // Loads and stores that need some form of masked operation are predicated
1543 // instructions.
1544 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1545 return Legal->isMaskRequired(I);
1546 return isScalarWithPredication(I);
1549 /// Returns true if \p I is a memory instruction with consecutive memory
1550 /// access that can be widened.
1551 bool
1552 memoryInstructionCanBeWidened(Instruction *I,
1553 ElementCount VF = ElementCount::getFixed(1));
1555 /// Returns true if \p I is a memory instruction in an interleaved-group
1556 /// of memory accesses that can be vectorized with wide vector loads/stores
1557 /// and shuffles.
1558 bool
1559 interleavedAccessCanBeWidened(Instruction *I,
1560 ElementCount VF = ElementCount::getFixed(1));
1562 /// Check if \p Instr belongs to any interleaved access group.
1563 bool isAccessInterleaved(Instruction *Instr) {
1564 return InterleaveInfo.isInterleaved(Instr);
1567 /// Get the interleaved access group that \p Instr belongs to.
1568 const InterleaveGroup<Instruction> *
1569 getInterleavedAccessGroup(Instruction *Instr) {
1570 return InterleaveInfo.getInterleaveGroup(Instr);
1573 /// Returns true if we're required to use a scalar epilogue for at least
1574 /// the final iteration of the original loop.
1575 bool requiresScalarEpilogue(ElementCount VF) const {
1576 if (!isScalarEpilogueAllowed())
1577 return false;
1578 // If we might exit from anywhere but the latch, we must run the exiting
1579 // iteration in scalar form.
1580 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1581 return true;
1582 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1585 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1586 /// loop hint annotation.
1587 bool isScalarEpilogueAllowed() const {
1588 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1591 /// Returns true if all loop blocks should be masked to fold the tail of the loop.
1592 bool foldTailByMasking() const { return FoldTailByMasking; }
1594 bool blockNeedsPredication(BasicBlock *BB) const {
1595 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1598 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1599 /// nodes to the chain of instructions representing the reductions. Uses a
1600 /// MapVector to ensure deterministic iteration order.
1601 using ReductionChainMap =
1602 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1604 /// Return the chain of instructions representing an inloop reduction.
1605 const ReductionChainMap &getInLoopReductionChains() const {
1606 return InLoopReductionChains;
1609 /// Returns true if the Phi is part of an inloop reduction.
1610 bool isInLoopReduction(PHINode *Phi) const {
1611 return InLoopReductionChains.count(Phi);
1614 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1615 /// with factor VF. Return the cost of the instruction, including
1616 /// scalarization overhead if it's needed.
1617 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1619 /// Estimate cost of a call instruction CI if it were vectorized with factor
1620 /// VF. Return the cost of the instruction, including scalarization overhead
1621 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1622 /// scalarized -
1623 /// i.e. either a vector version isn't available, or it is too expensive.
1624 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1625 bool &NeedToScalarize) const;
1627 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1628 /// that of B.
1629 bool isMoreProfitable(const VectorizationFactor &A,
1630 const VectorizationFactor &B) const;
1632 /// Invalidates decisions already taken by the cost model.
1633 void invalidateCostModelingDecisions() {
1634 WideningDecisions.clear();
1635 Uniforms.clear();
1636 Scalars.clear();
1639 private:
1640 unsigned NumPredStores = 0;
1642 /// \return An upper bound for the vectorization factors for both
1643 /// fixed and scalable vectorization, where the minimum-known number of
1644 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1645 /// disabled or unsupported, then the scalable part will be equal to
1646 /// ElementCount::getScalable(0).
1647 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1648 ElementCount UserVF);
1650 /// \return the maximized element count based on the target's vector
1651 /// registers and the loop trip-count, but limited to a maximum safe VF.
1652 /// This is a helper function of computeFeasibleMaxVF.
1653 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1654 /// issue that occurred on one of the buildbots which cannot be reproduced
1655 /// without having access to the proprietary compiler (see comments on
1656 /// D98509). The issue is currently under investigation and this workaround
1657 /// will be removed as soon as possible.
1658 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1659 unsigned SmallestType,
1660 unsigned WidestType,
1661 const ElementCount &MaxSafeVF);
1663 /// \return the maximum legal scalable VF, based on the safe max number
1664 /// of elements.
1665 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1667 /// The vectorization cost is a combination of the cost itself and a boolean
1668 /// indicating whether any of the contributing operations will actually
1669 /// operate on vector values after type legalization in the backend. If this
1670 /// latter value is false, then all operations will be scalarized (i.e. no
1671 /// vectorization has actually taken place).
1672 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1674 /// Returns the expected execution cost. The unit of the cost does
1675 /// not matter because we use the 'cost' units to compare different
1676 /// vector widths. The cost that is returned is *not* normalized by
1677 /// the factor width. If \p Invalid is not nullptr, this function
1678 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1679 /// each instruction that has an Invalid cost for the given VF.
1680 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1681 VectorizationCostTy
1682 expectedCost(ElementCount VF,
1683 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1685 /// Returns the execution time cost of an instruction for a given vector
1686 /// width. Vector width of one means scalar.
1687 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1689 /// The cost-computation logic from getInstructionCost which provides
1690 /// the vector type as an output parameter.
1691 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1692 Type *&VectorTy);
1694 /// Return the cost of instructions in an inloop reduction pattern, if I is
1695 /// part of that pattern.
1696 Optional<InstructionCost>
1697 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1698 TTI::TargetCostKind CostKind);
1700 /// Calculate vectorization cost of memory instruction \p I.
1701 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1703 /// The cost computation for scalarized memory instruction.
1704 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1706 /// The cost computation for interleaving group of memory instructions.
1707 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1709 /// The cost computation for Gather/Scatter instruction.
1710 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1712 /// The cost computation for widening instruction \p I with consecutive
1713 /// memory access.
1714 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1716 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1717 /// Load: scalar load + broadcast.
1718 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1719 /// element)
1720 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1722 /// Estimate the overhead of scalarizing an instruction. This is a
1723 /// convenience wrapper for the type-based getScalarizationOverhead API.
1724 InstructionCost getScalarizationOverhead(Instruction *I,
1725 ElementCount VF) const;
1727 /// Returns whether the instruction is a load or store and will be emitted
1728 /// as a vector operation.
1729 bool isConsecutiveLoadOrStore(Instruction *I);
1731 /// Returns true if an artificially high cost for emulated masked memrefs
1732 /// should be used.
1733 bool useEmulatedMaskMemRefHack(Instruction *I);
1735 /// Map of scalar integer values to the smallest bitwidth they can be legally
1736 /// represented as. The vector equivalents of these values should be truncated
1737 /// to this type.
1738 MapVector<Instruction *, uint64_t> MinBWs;
1740 /// A type representing the costs for instructions if they were to be
1741 /// scalarized rather than vectorized. The entries are Instruction-Cost
1742 /// pairs.
1743 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1745 /// A set containing all BasicBlocks that are known to be present after
1746 /// vectorization as predicated blocks.
1747 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1749 /// Records whether it is allowed to have the original scalar loop execute at
1750 /// least once. This may be needed as a fallback loop in case runtime
1751 /// aliasing/dependence checks fail, or to handle the tail/remainder
1752 /// iterations when the trip count is unknown or is not a multiple of the VF,
1753 /// or as a peel-loop to handle gaps in interleave-groups.
1754 /// Under optsize and when the trip count is very small we don't allow any
1755 /// iterations to execute in the scalar loop.
1756 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1758 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1759 bool FoldTailByMasking = false;
1761 /// A map holding scalar costs for different vectorization factors. The
1762 /// presence of a cost for an instruction in the mapping indicates that the
1763 /// instruction will be scalarized when vectorizing with the associated
1764 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1765 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1767 /// Holds the instructions known to be uniform after vectorization.
1768 /// The data is collected per VF.
1769 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1771 /// Holds the instructions known to be scalar after vectorization.
1772 /// The data is collected per VF.
1773 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1775 /// Holds the instructions (address computations) that are forced to be
1776 /// scalarized.
1777 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1779 /// PHINodes of the reductions that should be expanded in-loop along with
1780 /// their associated chains of reduction operations, in program order from top
1781 /// (PHI) to bottom.
1782 ReductionChainMap InLoopReductionChains;
1784 /// A Map of inloop reduction operations and their immediate chain operand.
1785 /// FIXME: This can be removed once reductions can be costed correctly in
1786 /// vplan. This was added to allow quick lookup to the inloop operations,
1787 /// without having to loop through InLoopReductionChains.
1788 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1790 /// Returns the expected difference in cost from scalarizing the expression
1791 /// feeding a predicated instruction \p PredInst. The instructions to
1792 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1793 /// non-negative return value implies the expression will be scalarized.
1794 /// Currently, only single-use chains are considered for scalarization.
1795 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1796 ElementCount VF);
1798 /// Collect the instructions that are uniform after vectorization. An
1799 /// instruction is uniform if we represent it with a single scalar value in
1800 /// the vectorized loop corresponding to each vector iteration. Examples of
1801 /// uniform instructions include pointer operands of consecutive or
1802 /// interleaved memory accesses. Note that although uniformity implies an
1803 /// instruction will be scalar, the reverse is not true. In general, a
1804 /// scalarized instruction will be represented by VF scalar values in the
1805 /// vectorized loop, each corresponding to an iteration of the original
1806 /// scalar loop.
1807 void collectLoopUniforms(ElementCount VF);
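  // Illustrative example of the distinction: for a consecutive store to A[i]
  // with VF = 4, the address computation feeding the store is uniform after
  // vectorization (a single scalar GEP per vector iteration), whereas an
  // instruction scalarized under predication is scalar after vectorization
  // but not uniform (VF scalar copies, one per lane).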
1809 /// Collect the instructions that are scalar after vectorization. An
1810 /// instruction is scalar if it is known to be uniform or will be scalarized
1811 /// during vectorization. Non-uniform scalarized instructions will be
1812 /// represented by VF values in the vectorized loop, each corresponding to an
1813 /// iteration of the original scalar loop.
1814 void collectLoopScalars(ElementCount VF);
1816 /// Keeps cost model vectorization decision and cost for instructions.
1817 /// Right now it is used for memory instructions only.
1818 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1819 std::pair<InstWidening, InstructionCost>>;
1821 DecisionList WideningDecisions;
1823 /// Returns true if \p V is expected to be vectorized and it needs to be
1824 /// extracted.
1825 bool needsExtract(Value *V, ElementCount VF) const {
1826 Instruction *I = dyn_cast<Instruction>(V);
1827 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1828 TheLoop->isLoopInvariant(I))
1829 return false;
1831 // Assume we can vectorize V (and hence we need extraction) if the
1832 // scalars are not computed yet. This can happen, because it is called
1833 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1834 // the scalars are collected. That should be a safe assumption in most
1835 // cases, because we check if the operands have vectorizable types
1836 // beforehand in LoopVectorizationLegality.
1837 return Scalars.find(VF) == Scalars.end() ||
1838 !isScalarAfterVectorization(I, VF);
1841 /// Returns a range containing only operands needing to be extracted.
1842 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1843 ElementCount VF) const {
1844 return SmallVector<Value *, 4>(make_filter_range(
1845 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1848 /// Determines if we have the infrastructure to vectorize loop \p L and its
1849 /// epilogue, assuming the main loop is vectorized by \p VF.
1850 bool isCandidateForEpilogueVectorization(const Loop &L,
1851 const ElementCount VF) const;
1853 /// Returns true if epilogue vectorization is considered profitable, and
1854 /// false otherwise.
1855 /// \p VF is the vectorization factor chosen for the original loop.
1856 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1858 public:
1859 /// The loop that we evaluate.
1860 Loop *TheLoop;
1862 /// Predicated scalar evolution analysis.
1863 PredicatedScalarEvolution &PSE;
1865 /// Loop Info analysis.
1866 LoopInfo *LI;
1868 /// Vectorization legality.
1869 LoopVectorizationLegality *Legal;
1871 /// Vector target information.
1872 const TargetTransformInfo &TTI;
1874 /// Target Library Info.
1875 const TargetLibraryInfo *TLI;
1877 /// Demanded bits analysis.
1878 DemandedBits *DB;
1880 /// Assumption cache.
1881 AssumptionCache *AC;
1883 /// Interface to emit optimization remarks.
1884 OptimizationRemarkEmitter *ORE;
1886 const Function *TheFunction;
1888 /// Loop Vectorize Hint.
1889 const LoopVectorizeHints *Hints;
1891 /// The interleaved access information contains groups of interleaved accesses
1892 /// with the same stride that are close to each other.
1893 InterleavedAccessInfo &InterleaveInfo;
1895 /// Values to ignore in the cost model.
1896 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1898 /// Values to ignore in the cost model when VF > 1.
1899 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1901 /// All element types found in the loop.
1902 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1904 /// Profitable vector factors.
1905 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1907 } // end namespace llvm
1909 /// Helper struct to manage generating runtime checks for vectorization.
1911 /// The runtime checks are created up-front in temporary blocks to allow better
1912 /// cost estimation, and are un-linked from the existing IR. After deciding to
1913 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1914 /// temporary blocks are completely removed.
1915 class GeneratedRTChecks {
1916 /// Basic block which contains the generated SCEV checks, if any.
1917 BasicBlock *SCEVCheckBlock = nullptr;
1919 /// The value representing the result of the generated SCEV checks. If it is
1920 /// nullptr, either no SCEV checks have been generated or they have been used.
1921 Value *SCEVCheckCond = nullptr;
1923 /// Basic block which contains the generated memory runtime checks, if any.
1924 BasicBlock *MemCheckBlock = nullptr;
1926 /// The value representing the result of the generated memory runtime checks.
1927 /// If it is nullptr, either no memory runtime checks have been generated or
1928 /// they have been used.
1929 Instruction *MemRuntimeCheckCond = nullptr;
1931 DominatorTree *DT;
1932 LoopInfo *LI;
1934 SCEVExpander SCEVExp;
1935 SCEVExpander MemCheckExp;
1937 public:
1938 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1939 const DataLayout &DL)
1940 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1941 MemCheckExp(SE, DL, "scev.check") {}
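  // Illustrative, simplified usage sketch; SE, DT, LI, DL, L, LAI, UnionPred,
  // Bypass, LoopVectorPreHeader and LoopExitBlock stand for whatever the
  // caller has at hand (the pass driver is authoritative):
  //
  //   GeneratedRTChecks Checks(SE, DT, LI, DL);
  //   Checks.Create(L, LAI, UnionPred);
  //   // ...cost the generated blocks and decide whether to vectorize...
  //   Checks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  //   Checks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);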
1943 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1944 /// accurately estimate the cost of the runtime checks. The blocks are
1945 /// un-linked from the IR and are added back during vector code generation. If
1946 /// there is no vector code generation, the check blocks are removed
1947 /// completely.
1948 void Create(Loop *L, const LoopAccessInfo &LAI,
1949 const SCEVUnionPredicate &UnionPred) {
1951 BasicBlock *LoopHeader = L->getHeader();
1952 BasicBlock *Preheader = L->getLoopPreheader();
1954 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1955 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1956 // may be used by SCEVExpander. The blocks will be un-linked from their
1957 // predecessors and removed from LI & DT at the end of the function.
1958 if (!UnionPred.isAlwaysTrue()) {
1959 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1960 nullptr, "vector.scevcheck");
1962 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1963 &UnionPred, SCEVCheckBlock->getTerminator());
1966 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1967 if (RtPtrChecking.Need) {
1968 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1969 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1970 "vector.memcheck");
1972 std::tie(std::ignore, MemRuntimeCheckCond) =
1973 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1974 RtPtrChecking.getChecks(), MemCheckExp);
1975 assert(MemRuntimeCheckCond &&
1976 "no RT checks generated although RtPtrChecking "
1977 "claimed checks are required");
1980 if (!MemCheckBlock && !SCEVCheckBlock)
1981 return;
1983 // Unhook the temporary blocks with the checks and update various places
1984 // accordingly.
1985 if (SCEVCheckBlock)
1986 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1987 if (MemCheckBlock)
1988 MemCheckBlock->replaceAllUsesWith(Preheader);
1990 if (SCEVCheckBlock) {
1991 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1992 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1993 Preheader->getTerminator()->eraseFromParent();
1995 if (MemCheckBlock) {
1996 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1997 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1998 Preheader->getTerminator()->eraseFromParent();
2001 DT->changeImmediateDominator(LoopHeader, Preheader);
2002 if (MemCheckBlock) {
2003 DT->eraseNode(MemCheckBlock);
2004 LI->removeBlock(MemCheckBlock);
2006 if (SCEVCheckBlock) {
2007 DT->eraseNode(SCEVCheckBlock);
2008 LI->removeBlock(SCEVCheckBlock);
2012 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2013 /// unused.
2014 ~GeneratedRTChecks() {
2015 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2016 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2017 if (!SCEVCheckCond)
2018 SCEVCleaner.markResultUsed();
2020 if (!MemRuntimeCheckCond)
2021 MemCheckCleaner.markResultUsed();
2023 if (MemRuntimeCheckCond) {
2024 auto &SE = *MemCheckExp.getSE();
2025 // Memory runtime check generation creates compares that use expanded
2026 // values. Remove them before running the SCEVExpanderCleaners.
2027 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2028 if (MemCheckExp.isInsertedInstruction(&I))
2029 continue;
2030 SE.forgetValue(&I);
2031 SE.eraseValueFromMap(&I);
2032 I.eraseFromParent();
2035 MemCheckCleaner.cleanup();
2036 SCEVCleaner.cleanup();
2038 if (SCEVCheckCond)
2039 SCEVCheckBlock->eraseFromParent();
2040 if (MemRuntimeCheckCond)
2041 MemCheckBlock->eraseFromParent();
2044 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2045 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2046 /// depending on the generated condition.
2047 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2048 BasicBlock *LoopVectorPreHeader,
2049 BasicBlock *LoopExitBlock) {
2050 if (!SCEVCheckCond)
2051 return nullptr;
2052 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2053 if (C->isZero())
2054 return nullptr;
2056 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2058 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2059 // Create new preheader for vector loop.
2060 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2061 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2063 SCEVCheckBlock->getTerminator()->eraseFromParent();
2064 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2065 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2066 SCEVCheckBlock);
2068 DT->addNewBlock(SCEVCheckBlock, Pred);
2069 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2071 ReplaceInstWithInst(
2072 SCEVCheckBlock->getTerminator(),
2073 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2074 // Mark the check as used, to prevent it from being removed during cleanup.
2075 SCEVCheckCond = nullptr;
2076 return SCEVCheckBlock;
2079 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2080 /// the branches to branch to the vector preheader or \p Bypass, depending on
2081 /// the generated condition.
2082 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2083 BasicBlock *LoopVectorPreHeader) {
2084 // Check if we generated code that checks at runtime whether arrays overlap.
2085 if (!MemRuntimeCheckCond)
2086 return nullptr;
2088 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2089 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2090 MemCheckBlock);
2092 DT->addNewBlock(MemCheckBlock, Pred);
2093 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2094 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2096 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2097 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2099 ReplaceInstWithInst(
2100 MemCheckBlock->getTerminator(),
2101 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2102 MemCheckBlock->getTerminator()->setDebugLoc(
2103 Pred->getTerminator()->getDebugLoc());
2105 // Mark the check as used, to prevent it from being removed during cleanup.
2106 MemRuntimeCheckCond = nullptr;
2107 return MemCheckBlock;
2111 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2112 // vectorization. The loop needs to be annotated with #pragma omp simd
2113 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2114 // vector length information is not provided, vectorization is not considered
2115 // explicit. Interleave hints are not allowed either. These limitations will be
2116 // relaxed in the future.
2117 // Please note that we are currently forced to abuse the pragma 'clang
2118 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2119 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2120 // provides *explicit vectorization hints* (LV can bypass legal checks and
2121 // assume that vectorization is legal). However, both hints are implemented
2122 // using the same metadata (llvm.loop.vectorize, processed by
2123 // LoopVectorizeHints). This will be fixed in the future when the native IR
2124 // representation for pragma 'omp simd' is introduced.
2125 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2126 OptimizationRemarkEmitter *ORE) {
2127 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2128 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2130 // Only outer loops with an explicit vectorization hint are supported.
2131 // Unannotated outer loops are ignored.
2132 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2133 return false;
2135 Function *Fn = OuterLp->getHeader()->getParent();
2136 if (!Hints.allowVectorization(Fn, OuterLp,
2137 true /*VectorizeOnlyWhenForced*/)) {
2138 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2139 return false;
2142 if (Hints.getInterleave() > 1) {
2143 // TODO: Interleave support is future work.
2144 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2145 "outer loops.\n");
2146 Hints.emitRemarkWithHints();
2147 return false;
2150 return true;
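// Illustrative example of an outer loop that satisfies these requirements
// (explicit vector width, no interleave hint):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop to be vectorized
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];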
2153 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2154 OptimizationRemarkEmitter *ORE,
2155 SmallVectorImpl<Loop *> &V) {
2156 // Collect inner loops and outer loops without irreducible control flow. For
2157 // now, only collect outer loops that have explicit vectorization hints. If we
2158 // are stress testing the VPlan H-CFG construction, we collect the outermost
2159 // loop of every loop nest.
2160 if (L.isInnermost() || VPlanBuildStressTest ||
2161 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2162 LoopBlocksRPO RPOT(&L);
2163 RPOT.perform(LI);
2164 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2165 V.push_back(&L);
2166 // TODO: Collect inner loops inside marked outer loops in case
2167 // vectorization fails for the outer loop. Do not invoke
2168 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2169 // already known to be reducible. We can use an inherited attribute for
2170 // that.
2171 return;
2174 for (Loop *InnerL : L)
2175 collectSupportedLoops(*InnerL, LI, ORE, V);
2178 namespace {
2180 /// The LoopVectorize Pass.
2181 struct LoopVectorize : public FunctionPass {
2182 /// Pass identification, replacement for typeid
2183 static char ID;
2185 LoopVectorizePass Impl;
2187 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2188 bool VectorizeOnlyWhenForced = false)
2189 : FunctionPass(ID),
2190 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2191 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2194 bool runOnFunction(Function &F) override {
2195 if (skipFunction(F))
2196 return false;
2198 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2199 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2200 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2201 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2202 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2203 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2204 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2205 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2206 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2207 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2208 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2209 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2210 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2212 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2213 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2215 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2216 GetLAA, *ORE, PSI).MadeAnyChange;
2219 void getAnalysisUsage(AnalysisUsage &AU) const override {
2220 AU.addRequired<AssumptionCacheTracker>();
2221 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2222 AU.addRequired<DominatorTreeWrapperPass>();
2223 AU.addRequired<LoopInfoWrapperPass>();
2224 AU.addRequired<ScalarEvolutionWrapperPass>();
2225 AU.addRequired<TargetTransformInfoWrapperPass>();
2226 AU.addRequired<AAResultsWrapperPass>();
2227 AU.addRequired<LoopAccessLegacyAnalysis>();
2228 AU.addRequired<DemandedBitsWrapperPass>();
2229 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2230 AU.addRequired<InjectTLIMappingsLegacy>();
2232 // We currently do not preserve loopinfo/dominator analyses with outer loop
2233 // vectorization. Until this is addressed, mark these analyses as preserved
2234 // only for non-VPlan-native path.
2235 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2236 if (!EnableVPlanNativePath) {
2237 AU.addPreserved<LoopInfoWrapperPass>();
2238 AU.addPreserved<DominatorTreeWrapperPass>();
2241 AU.addPreserved<BasicAAWrapperPass>();
2242 AU.addPreserved<GlobalsAAWrapperPass>();
2243 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2247 } // end anonymous namespace
2249 //===----------------------------------------------------------------------===//
2250 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2251 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2252 //===----------------------------------------------------------------------===//
2254 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2255 // We need to place the broadcast of invariant variables outside the loop,
2256 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2257 // inside the vector loop body.
2258 Instruction *Instr = dyn_cast<Instruction>(V);
2259 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2260 (!Instr ||
2261 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2262 // Place the code for broadcasting invariant variables in the new preheader.
2263 IRBuilder<>::InsertPointGuard Guard(Builder);
2264 if (SafeToHoist)
2265 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2267 // Broadcast the scalar into all locations in the vector.
2268 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2270 return Shuf;
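// Illustrative IR (modulo the undef/poison placeholder used by the current
// revision) for broadcasting a loop-invariant i32 %x with VF = 4, roughly what
// CreateVectorSplat produces:
//
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer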
2273 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2274 const InductionDescriptor &II, Value *Step, Value *Start,
2275 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2276 VPTransformState &State) {
2277 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2278 "Expected either an induction phi-node or a truncate of it!");
2280 // Construct the initial value of the vector IV in the vector loop preheader
2281 auto CurrIP = Builder.saveIP();
2282 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2283 if (isa<TruncInst>(EntryVal)) {
2284 assert(Start->getType()->isIntegerTy() &&
2285 "Truncation requires an integer type");
2286 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2287 Step = Builder.CreateTrunc(Step, TruncType);
2288 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2290 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2291 Value *SteppedStart =
2292 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2294 // We create vector phi nodes for both integer and floating-point induction
2295 // variables. Here, we determine the kind of arithmetic we will perform.
2296 Instruction::BinaryOps AddOp;
2297 Instruction::BinaryOps MulOp;
2298 if (Step->getType()->isIntegerTy()) {
2299 AddOp = Instruction::Add;
2300 MulOp = Instruction::Mul;
2301 } else {
2302 AddOp = II.getInductionOpcode();
2303 MulOp = Instruction::FMul;
2306 // Multiply the vectorization factor by the step using integer or
2307 // floating-point arithmetic as appropriate.
2308 Type *StepType = Step->getType();
2309 if (Step->getType()->isFloatingPointTy())
2310 StepType = IntegerType::get(StepType->getContext(),
2311 StepType->getScalarSizeInBits());
2312 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2313 if (Step->getType()->isFloatingPointTy())
2314 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2315 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2317 // Create a vector splat to use in the induction update.
2319 // FIXME: If the step is non-constant, we create the vector splat with
2320 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2321 // handle a constant vector splat.
2322 Value *SplatVF = isa<Constant>(Mul)
2323 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2324 : Builder.CreateVectorSplat(VF, Mul);
2325 Builder.restoreIP(CurrIP);
2327 // We may need to add the step a number of times, depending on the unroll
2328 // factor. The last of those goes into the PHI.
2329 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2330 &*LoopVectorBody->getFirstInsertionPt());
2331 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2332 Instruction *LastInduction = VecInd;
2333 for (unsigned Part = 0; Part < UF; ++Part) {
2334 State.set(Def, LastInduction, Part);
2336 if (isa<TruncInst>(EntryVal))
2337 addMetadata(LastInduction, EntryVal);
2338 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2339 State, Part);
2341 LastInduction = cast<Instruction>(
2342 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2343 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2346 // Move the last step to the end of the latch block. This ensures consistent
2347 // placement of all induction updates.
2348 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2349 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2350 auto *ICmp = cast<Instruction>(Br->getCondition());
2351 LastInduction->moveBefore(ICmp);
2352 LastInduction->setName("vec.ind.next");
2354 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2355 VecInd->addIncoming(LastInduction, LoopVectorLatch);
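// Illustrative IR for the vector IV created here, assuming a fixed VF = 4,
// UF = 1, start value 0 and step 1:
//
//   vector.body:
//     %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>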
2358 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2359 return Cost->isScalarAfterVectorization(I, VF) ||
2360 Cost->isProfitableToScalarize(I, VF);
2363 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2364 if (shouldScalarizeInstruction(IV))
2365 return true;
2366 auto isScalarInst = [&](User *U) -> bool {
2367 auto *I = cast<Instruction>(U);
2368 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2370 return llvm::any_of(IV->users(), isScalarInst);
2373 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2374 const InductionDescriptor &ID, const Instruction *EntryVal,
2375 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2376 unsigned Part, unsigned Lane) {
2377 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2378 "Expected either an induction phi-node or a truncate of it!");
2380 // This induction variable is not the phi from the original loop but the
2381 // newly-created IV based on the proof that the casted Phi is equal to the
2382 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2383 // re-uses the same InductionDescriptor that the original IV uses, but we
2384 // don't have to do any recording in this case - that is done when the
2385 // original IV is processed.
2386 if (isa<TruncInst>(EntryVal))
2387 return;
2389 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2390 if (Casts.empty())
2391 return;
2392 // Only the first Cast instruction in the Casts vector is of interest.
2393 // The rest of the Casts (if they exist) have no uses outside the
2394 // induction update chain itself.
2395 if (Lane < UINT_MAX)
2396 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2397 else
2398 State.set(CastDef, VectorLoopVal, Part);
2401 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2402 TruncInst *Trunc, VPValue *Def,
2403 VPValue *CastDef,
2404 VPTransformState &State) {
2405 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2406 "Primary induction variable must have an integer type");
2408 auto II = Legal->getInductionVars().find(IV);
2409 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2411 auto ID = II->second;
2412 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2414 // The value from the original loop to which we are mapping the new induction
2415 // variable.
2416 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2418 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2420 // Generate code for the induction step. Note that induction steps are
2421 // required to be loop-invariant.
2422 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2423 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2424 "Induction step should be loop invariant");
2425 if (PSE.getSE()->isSCEVable(IV->getType())) {
2426 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2427 return Exp.expandCodeFor(Step, Step->getType(),
2428 LoopVectorPreHeader->getTerminator());
2430 return cast<SCEVUnknown>(Step)->getValue();
2433 // The scalar value to broadcast. This is derived from the canonical
2434 // induction variable. If a truncation type is given, truncate the canonical
2435 // induction variable and step. Otherwise, derive these values from the
2436 // induction descriptor.
2437 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2438 Value *ScalarIV = Induction;
2439 if (IV != OldInduction) {
2440 ScalarIV = IV->getType()->isIntegerTy()
2441 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2442 : Builder.CreateCast(Instruction::SIToFP, Induction,
2443 IV->getType());
2444 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2445 ScalarIV->setName("offset.idx");
2447 if (Trunc) {
2448 auto *TruncType = cast<IntegerType>(Trunc->getType());
2449 assert(Step->getType()->isIntegerTy() &&
2450 "Truncation requires an integer step");
2451 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2452 Step = Builder.CreateTrunc(Step, TruncType);
2454 return ScalarIV;
2457 // Create the vector values from the scalar IV when we are not creating a
2458 // vector IV.
2459 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2460 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2461 for (unsigned Part = 0; Part < UF; ++Part) {
2462 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2463 Value *EntryPart =
2464 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2465 ID.getInductionOpcode());
2466 State.set(Def, EntryPart, Part);
2467 if (Trunc)
2468 addMetadata(EntryPart, Trunc);
2469 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2470 State, Part);
2474 // Fast-math-flags propagate from the original induction instruction.
2475 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2476 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2477 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2479 // Now do the actual transformations, and start with creating the step value.
2480 Value *Step = CreateStepValue(ID.getStep());
2481 if (VF.isZero() || VF.isScalar()) {
2482 Value *ScalarIV = CreateScalarIV(Step);
2483 CreateSplatIV(ScalarIV, Step);
2484 return;
2487 // Determine if we want a scalar version of the induction variable. This is
2488 // true if the induction variable itself is not widened, or if it has at
2489 // least one user in the loop that is not widened.
2490 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2491 if (!NeedsScalarIV) {
2492 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2493 State);
2494 return;
2497 // Try to create a new independent vector induction variable. If we can't
2498 // create the phi node, we will splat the scalar induction variable in each
2499 // loop iteration.
2500 if (!shouldScalarizeInstruction(EntryVal)) {
2501 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2502 State);
2503 Value *ScalarIV = CreateScalarIV(Step);
2504 // Create scalar steps that can be used by instructions we will later
2505 // scalarize. Note that the addition of the scalar steps will not increase
2506 // the number of instructions in the loop in the common case prior to
2507 // InstCombine. We will be trading one vector extract for each scalar step.
2508 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2509 return;
2512 // All IV users are scalar instructions, so only emit a scalar IV, not a
2513 // vectorized IV. Except when we tail-fold, in which case the splat IV feeds the
2514 // predicate used by the masked loads/stores.
2515 Value *ScalarIV = CreateScalarIV(Step);
2516 if (!Cost->isScalarEpilogueAllowed())
2517 CreateSplatIV(ScalarIV, Step);
2518 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2521 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2522 Instruction::BinaryOps BinOp) {
2523 // Create and check the types.
2524 auto *ValVTy = cast<VectorType>(Val->getType());
2525 ElementCount VLen = ValVTy->getElementCount();
2527 Type *STy = Val->getType()->getScalarType();
2528 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2529 "Induction Step must be an integer or FP");
2530 assert(Step->getType() == STy && "Step has wrong type");
2532 SmallVector<Constant *, 8> Indices;
2534 // Create a vector of consecutive numbers from zero to VF.
2535 VectorType *InitVecValVTy = ValVTy;
2536 Type *InitVecValSTy = STy;
2537 if (STy->isFloatingPointTy()) {
2538 InitVecValSTy =
2539 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2540 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2542 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2544 // Add on StartIdx
2545 Value *StartIdxSplat = Builder.CreateVectorSplat(
2546 VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2547 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2549 if (STy->isIntegerTy()) {
2550 Step = Builder.CreateVectorSplat(VLen, Step);
2551 assert(Step->getType() == Val->getType() && "Invalid step vec");
2552 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2553 // which can be found from the original scalar operations.
2554 Step = Builder.CreateMul(InitVec, Step);
2555 return Builder.CreateAdd(Val, Step, "induction");
2558 // Floating point induction.
2559 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2560 "Binary Opcode should be specified for FP induction");
2561 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2562 Step = Builder.CreateVectorSplat(VLen, Step);
2563 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2564 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
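// Illustrative example: for a fixed <4 x i64> %val, StartIdx = 2 and Step = 1,
// the result is conceptually
//   %induction = add <4 x i64> %val, <i64 2, i64 3, i64 4, i64 5>
// i.e. Val + (StartIdx + <0, 1, ..., VF-1>) * Step; using CreateStepVector
// keeps the same construction valid for scalable vectors.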
2567 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2568 Instruction *EntryVal,
2569 const InductionDescriptor &ID,
2570 VPValue *Def, VPValue *CastDef,
2571 VPTransformState &State) {
2572 // We shouldn't have to build scalar steps if we aren't vectorizing.
2573 assert(VF.isVector() && "VF should be greater than one");
2574 // Get the value type and ensure it and the step have the same integer type.
2575 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2576 assert(ScalarIVTy == Step->getType() &&
2577 "Val and Step should have the same type");
2579 // We build scalar steps for both integer and floating-point induction
2580 // variables. Here, we determine the kind of arithmetic we will perform.
2581 Instruction::BinaryOps AddOp;
2582 Instruction::BinaryOps MulOp;
2583 if (ScalarIVTy->isIntegerTy()) {
2584 AddOp = Instruction::Add;
2585 MulOp = Instruction::Mul;
2586 } else {
2587 AddOp = ID.getInductionOpcode();
2588 MulOp = Instruction::FMul;
2591 // Determine the number of scalars we need to generate for each unroll
2592 // iteration. If EntryVal is uniform, we only need to generate the first
2593 // lane. Otherwise, we generate all VF values.
2594 bool IsUniform =
2595 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2596 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2597 // Compute the scalar steps and save the results in State.
2598 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2599 ScalarIVTy->getScalarSizeInBits());
2600 Type *VecIVTy = nullptr;
2601 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2602 if (!IsUniform && VF.isScalable()) {
2603 VecIVTy = VectorType::get(ScalarIVTy, VF);
2604 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2605 SplatStep = Builder.CreateVectorSplat(VF, Step);
2606 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2609 for (unsigned Part = 0; Part < UF; ++Part) {
2610 Value *StartIdx0 =
2611 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2613 if (!IsUniform && VF.isScalable()) {
2614 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2615 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2616 if (ScalarIVTy->isFloatingPointTy())
2617 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2618 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2619 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2620 State.set(Def, Add, Part);
2621 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2622 Part);
2623 // It's useful to record the lane values too for the known minimum number
2624 // of elements so we do those below. This improves the code quality when
2625 // trying to extract the first element, for example.
2628 if (ScalarIVTy->isFloatingPointTy())
2629 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2631 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2632 Value *StartIdx = Builder.CreateBinOp(
2633 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2634 // The step returned by `createStepForVF` is a runtime-evaluated value
2635 // when VF is scalable. Otherwise, it should be folded into a Constant.
2636 assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2637 "Expected StartIdx to be folded to a constant when VF is not "
2638 "scalable");
2639 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2640 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2641 State.set(Def, Add, VPIteration(Part, Lane));
2642 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2643 Part, Lane);
2648 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2649 const VPIteration &Instance,
2650 VPTransformState &State) {
2651 Value *ScalarInst = State.get(Def, Instance);
2652 Value *VectorValue = State.get(Def, Instance.Part);
2653 VectorValue = Builder.CreateInsertElement(
2654 VectorValue, ScalarInst,
2655 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2656 State.set(Def, VectorValue, Instance.Part);
2659 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2660 assert(Vec->getType()->isVectorTy() && "Invalid type");
2661 return Builder.CreateVectorReverse(Vec, "reverse");
2664 // Return whether we allow using masked interleave-groups (for dealing with
2665 // strided loads/stores that reside in predicated blocks, or for dealing
2666 // with gaps).
2667 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2668 // If an override option has been passed in for interleaved accesses, use it.
2669 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2670 return EnableMaskedInterleavedMemAccesses;
2672 return TTI.enableMaskedInterleavedAccessVectorization();
2675 // Try to vectorize the interleave group that \p Instr belongs to.
2677 // E.g. Translate the following interleaved load group (factor = 3):
2678 // for (i = 0; i < N; i+=3) {
2679 // R = Pic[i]; // Member of index 0
2680 // G = Pic[i+1]; // Member of index 1
2681 // B = Pic[i+2]; // Member of index 2
2682 // ... // do something to R, G, B
2683 // }
2684 // To:
2685 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2686 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2687 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2688 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2690 // Or translate the following interleaved store group (factor = 3):
2691 // for (i = 0; i < N; i+=3) {
2692 // ... do something to R, G, B
2693 // Pic[i] = R; // Member of index 0
2694 // Pic[i+1] = G; // Member of index 1
2695 // Pic[i+2] = B; // Member of index 2
2696 // }
2697 // To:
2698 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2699 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2700 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2701 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2702 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2703 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2704 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2705 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2706 VPValue *BlockInMask) {
2707 Instruction *Instr = Group->getInsertPos();
2708 const DataLayout &DL = Instr->getModule()->getDataLayout();
2710 // Prepare for the vector type of the interleaved load/store.
2711 Type *ScalarTy = getLoadStoreType(Instr);
2712 unsigned InterleaveFactor = Group->getFactor();
2713 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2714 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2716 // Prepare for the new pointers.
2717 SmallVector<Value *, 2> AddrParts;
2718 unsigned Index = Group->getIndex(Instr);
2720 // TODO: extend the masked interleaved-group support to reversed access.
2721 assert((!BlockInMask || !Group->isReverse()) &&
2722 "Reversed masked interleave-group not supported.");
2724 // If the group is reverse, adjust the index to refer to the last vector lane
2725 // instead of the first. We adjust the index from the first vector lane,
2726 // rather than directly getting the pointer for lane VF - 1, because the
2727 // pointer operand of the interleaved access is supposed to be uniform. For
2728 // uniform instructions, we're only required to generate a value for the
2729 // first vector lane in each unroll iteration.
2730 if (Group->isReverse())
2731 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
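// Illustrative example (assumed values): with VF = 4 and factor = 2, an insert
// position at member index 1 gives Index = 1 + 3 * 2 = 7, so the GEP below
// moves the uniform lane-0 pointer back by 7 elements to member 0 of the
// lowest-addressed tuple covered by this part.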
2733 for (unsigned Part = 0; Part < UF; Part++) {
2734 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2735 setDebugLocFromInst(AddrPart);
2737 // Note that the current instruction could be at any member index. We need to
2738 // adjust the address to that of the member at index 0.
2740 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2741 // b = A[i]; // Member of index 0
2742 // The current pointer points to A[i+1]; adjust it to A[i].
2744 // E.g. A[i+1] = a; // Member of index 1
2745 // A[i] = b; // Member of index 0
2746 // A[i+2] = c; // Member of index 2 (Current instruction)
2747 // The current pointer points to A[i+2]; adjust it to A[i].
2749 bool InBounds = false;
2750 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2751 InBounds = gep->isInBounds();
2752 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2753 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2755 // Cast to the vector pointer type.
2756 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2757 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2758 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2761 setDebugLocFromInst(Instr);
2762 Value *PoisonVec = PoisonValue::get(VecTy);
2764 Value *MaskForGaps = nullptr;
2765 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2766 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2767 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2770 // Vectorize the interleaved load group.
2771 if (isa<LoadInst>(Instr)) {
2772 // For each unroll part, create a wide load for the group.
2773 SmallVector<Value *, 2> NewLoads;
2774 for (unsigned Part = 0; Part < UF; Part++) {
2775 Instruction *NewLoad;
2776 if (BlockInMask || MaskForGaps) {
2777 assert(useMaskedInterleavedAccesses(*TTI) &&
2778 "masked interleaved groups are not allowed.");
2779 Value *GroupMask = MaskForGaps;
2780 if (BlockInMask) {
2781 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2782 Value *ShuffledMask = Builder.CreateShuffleVector(
2783 BlockInMaskPart,
2784 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2785 "interleaved.mask");
2786 GroupMask = MaskForGaps
2787 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2788 MaskForGaps)
2789 : ShuffledMask;
2791 NewLoad =
2792 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2793 GroupMask, PoisonVec, "wide.masked.vec");
2795 else
2796 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2797 Group->getAlign(), "wide.vec");
2798 Group->addMetadata(NewLoad);
2799 NewLoads.push_back(NewLoad);
2802 // For each member in the group, shuffle out the appropriate data from the
2803 // wide loads.
2804 unsigned J = 0;
2805 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2806 Instruction *Member = Group->getMember(I);
2808 // Skip the gaps in the group.
2809 if (!Member)
2810 continue;
2812 auto StrideMask =
2813 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2814 for (unsigned Part = 0; Part < UF; Part++) {
2815 Value *StridedVec = Builder.CreateShuffleVector(
2816 NewLoads[Part], StrideMask, "strided.vec");
2818 // If this member has a different type, cast the result to that type.
2819 if (Member->getType() != ScalarTy) {
2820 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2821 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2822 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2825 if (Group->isReverse())
2826 StridedVec = reverseVector(StridedVec);
2828 State.set(VPDefs[J], StridedVec, Part);
2830 ++J;
2832 return;
2835 // The subvector type for the current instruction.
2836 auto *SubVT = VectorType::get(ScalarTy, VF);
2838 // Vectorize the interleaved store group.
2839 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2840 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2841 "masked interleaved groups are not allowed.");
2842 assert((!MaskForGaps || !VF.isScalable()) &&
2843 "masking gaps for scalable vectors is not yet supported.");
2844 for (unsigned Part = 0; Part < UF; Part++) {
2845 // Collect the stored vector from each member.
2846 SmallVector<Value *, 4> StoredVecs;
2847 for (unsigned i = 0; i < InterleaveFactor; i++) {
2848 assert((Group->getMember(i) || MaskForGaps) &&
2849 "Fail to get a member from an interleaved store group");
2850 Instruction *Member = Group->getMember(i);
2852 // Skip the gaps in the group.
2853 if (!Member) {
2854 Value *Undef = PoisonValue::get(SubVT);
2855 StoredVecs.push_back(Undef);
2856 continue;
2859 Value *StoredVec = State.get(StoredValues[i], Part);
2861 if (Group->isReverse())
2862 StoredVec = reverseVector(StoredVec);
2864 // If this member has a different type, cast it to the unified subvector type.
2866 if (StoredVec->getType() != SubVT)
2867 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2869 StoredVecs.push_back(StoredVec);
2872 // Concatenate all vectors into a wide vector.
2873 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2875 // Interleave the elements in the wide vector.
2876 Value *IVec = Builder.CreateShuffleVector(
2877 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2878 "interleaved.vec");
2880 Instruction *NewStoreInstr;
2881 if (BlockInMask || MaskForGaps) {
2882 Value *GroupMask = MaskForGaps;
2883 if (BlockInMask) {
2884 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2885 Value *ShuffledMask = Builder.CreateShuffleVector(
2886 BlockInMaskPart,
2887 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2888 "interleaved.mask");
2889 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2890 ShuffledMask, MaskForGaps)
2891 : ShuffledMask;
2893 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2894 Group->getAlign(), GroupMask);
2895 } else
2896 NewStoreInstr =
2897 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2899 Group->addMetadata(NewStoreInstr);
2903 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2904 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2905 VPValue *StoredValue, VPValue *BlockInMask) {
2906 // Attempt to issue a wide load.
2907 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2908 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2910 assert((LI || SI) && "Invalid Load/Store instruction");
2911 assert((!SI || StoredValue) && "No stored value provided for widened store");
2912 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2914 LoopVectorizationCostModel::InstWidening Decision =
2915 Cost->getWideningDecision(Instr, VF);
2916 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2917 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2918 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2919 "CM decision is not to widen the memory instruction");
2921 Type *ScalarDataTy = getLoadStoreType(Instr);
2923 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2924 const Align Alignment = getLoadStoreAlignment(Instr);
2926 // Determine if the pointer operand of the access is either consecutive or
2927 // reverse consecutive.
2928 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2929 bool ConsecutiveStride =
2930 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2931 bool CreateGatherScatter =
2932 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2934 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2935 // gather/scatter. Otherwise Decision should have been to Scalarize.
2936 assert((ConsecutiveStride || CreateGatherScatter) &&
2937 "The instruction should be scalarized");
2938 (void)ConsecutiveStride;
2940 VectorParts BlockInMaskParts(UF);
2941 bool isMaskRequired = BlockInMask;
2942 if (isMaskRequired)
2943 for (unsigned Part = 0; Part < UF; ++Part)
2944 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2946 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2947 // Calculate the pointer for the specific unroll-part.
2948 GetElementPtrInst *PartPtr = nullptr;
2950 bool InBounds = false;
2951 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2952 InBounds = gep->isInBounds();
2953 if (Reverse) {
2954 // If the address is consecutive but reversed, then the
2955 // wide store needs to start at the last vector element.
2956 // RunTimeVF = VScale * VF.getKnownMinValue()
2957 // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
2958 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2959 // NumElt = -Part * RunTimeVF
2960 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2961 // LastLane = 1 - RunTimeVF
2962 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2963 PartPtr =
2964 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2965 PartPtr->setIsInBounds(InBounds);
2966 PartPtr = cast<GetElementPtrInst>(
2967 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2968 PartPtr->setIsInBounds(InBounds);
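// Illustrative example (assumed values): for fixed-width VF = 4 and Part = 1,
// RunTimeVF = 4, NumElt = -4 and LastLane = -3, so PartPtr ends up 7 elements
// below Ptr, the lowest address touched by the reversed part-1 vector.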
2969 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2970 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2971 } else {
2972 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2973 PartPtr = cast<GetElementPtrInst>(
2974 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2975 PartPtr->setIsInBounds(InBounds);
2978 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2979 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2982 // Handle Stores:
2983 if (SI) {
2984 setDebugLocFromInst(SI);
2986 for (unsigned Part = 0; Part < UF; ++Part) {
2987 Instruction *NewSI = nullptr;
2988 Value *StoredVal = State.get(StoredValue, Part);
2989 if (CreateGatherScatter) {
2990 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2991 Value *VectorGep = State.get(Addr, Part);
2992 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2993 MaskPart);
2994 } else {
2995 if (Reverse) {
2996 // If we store to reverse consecutive memory locations, then we need
2997 // to reverse the order of elements in the stored value.
2998 StoredVal = reverseVector(StoredVal);
2999 // We don't want to update the value in the map as it might be used in
3000 // another expression. So don't call resetVectorValue(StoredVal).
3002 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3003 if (isMaskRequired)
3004 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3005 BlockInMaskParts[Part]);
3006 else
3007 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3009 addMetadata(NewSI, SI);
3011 return;
3014 // Handle loads.
3015 assert(LI && "Must have a load instruction");
3016 setDebugLocFromInst(LI);
3017 for (unsigned Part = 0; Part < UF; ++Part) {
3018 Value *NewLI;
3019 if (CreateGatherScatter) {
3020 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
3021 Value *VectorGep = State.get(Addr, Part);
3022 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
3023 nullptr, "wide.masked.gather");
3024 addMetadata(NewLI, LI);
3025 } else {
3026 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3027 if (isMaskRequired)
3028 NewLI = Builder.CreateMaskedLoad(
3029 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
3030 PoisonValue::get(DataTy), "wide.masked.load");
3031 else
3032 NewLI =
3033 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
3035 // Add metadata to the load, but set the vector value to the reversed shuffle.
3036 addMetadata(NewLI, LI);
3037 if (Reverse)
3038 NewLI = reverseVector(NewLI);
3041 State.set(Def, NewLI, Part);
3045 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3046 VPUser &User,
3047 const VPIteration &Instance,
3048 bool IfPredicateInstr,
3049 VPTransformState &State) {
3050 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3052 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3053 // the first lane and part.
3054 if (isa<NoAliasScopeDeclInst>(Instr))
3055 if (!Instance.isFirstIteration())
3056 return;
3058 setDebugLocFromInst(Instr);
3060 // Does this instruction return a value?
3061 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3063 Instruction *Cloned = Instr->clone();
3064 if (!IsVoidRetTy)
3065 Cloned->setName(Instr->getName() + ".cloned");
3067 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3068 Builder.GetInsertPoint());
3069 // Replace the operands of the cloned instructions with their scalar
3070 // equivalents in the new loop.
3071 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3072 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3073 auto InputInstance = Instance;
3074 if (!Operand || !OrigLoop->contains(Operand) ||
3075 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3076 InputInstance.Lane = VPLane::getFirstLane();
3077 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3078 Cloned->setOperand(op, NewOp);
3080 addNewMetadata(Cloned, Instr);
3082 // Place the cloned scalar in the new loop.
3083 Builder.Insert(Cloned);
3085 State.set(Def, Cloned, Instance);
3087 // If we just cloned a new assumption, add it to the assumption cache.
3088 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3089 AC->registerAssumption(II);
3091 // End if-block.
3092 if (IfPredicateInstr)
3093 PredicatedInstructions.push_back(Cloned);
3096 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3097 Value *End, Value *Step,
3098 Instruction *DL) {
3099 BasicBlock *Header = L->getHeader();
3100 BasicBlock *Latch = L->getLoopLatch();
3101 // As we're just creating this loop, it's possible no latch exists
3102 // yet. If so, use the header as this will be a single block loop.
3103 if (!Latch)
3104 Latch = Header;
3106 IRBuilder<> B(&*Header->getFirstInsertionPt());
3107 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3108 setDebugLocFromInst(OldInst, &B);
3109 auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3111 B.SetInsertPoint(Latch->getTerminator());
3112 setDebugLocFromInst(OldInst, &B);
3114 // Create i+1 and fill the PHINode.
3116 // If the tail is not folded, we know that End - Start >= Step (either
3117 // statically or through the minimum iteration checks). We also know that both
3118 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3119 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3120 // overflows and we can mark the induction increment as NUW.
3121 Value *Next = B.CreateAdd(Induction, Step, "index.next",
3122 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
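// Illustrative example (assumed values): with Start = 0, End = 16 and
// Step = 8 (VF = 4, UF = 2), the IV takes the values 0 and 8 and the loop
// exits once Next == 16, so Next never wraps and the NUW flag is justified
// when the tail is not folded.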
3123 Induction->addIncoming(Start, L->getLoopPreheader());
3124 Induction->addIncoming(Next, Latch);
3125 // Create the compare.
3126 Value *ICmp = B.CreateICmpEQ(Next, End);
3127 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3129 // Now we have two terminators. Remove the old one from the block.
3130 Latch->getTerminator()->eraseFromParent();
3132 return Induction;
3135 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3136 if (TripCount)
3137 return TripCount;
3139 assert(L && "Create Trip Count for null loop.");
3140 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3141 // Find the loop boundaries.
3142 ScalarEvolution *SE = PSE.getSE();
3143 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3144 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3145 "Invalid loop count");
3147 Type *IdxTy = Legal->getWidestInductionType();
3148 assert(IdxTy && "No type for induction");
3150 // The exit count might have type i64 while the phi is i32. This can happen
3151 // if we have an induction variable that is sign-extended before the compare.
3152 // The only way we get a backedge-taken count in that situation is if the
3153 // induction variable was signed and therefore will not overflow, in which
3154 // case truncation is legal.
3155 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3156 IdxTy->getPrimitiveSizeInBits())
3157 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3158 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3160 // Get the total trip count from the count by adding 1.
3161 const SCEV *ExitCount = SE->getAddExpr(
3162 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3164 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3166 // Expand the trip count and place the new instructions in the preheader.
3167 // Notice that the pre-header does not change, only the loop body.
3168 SCEVExpander Exp(*SE, DL, "induction");
3170 // Count holds the overall loop count (N).
3171 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3172 L->getLoopPreheader()->getTerminator());
3174 if (TripCount->getType()->isPointerTy())
3175 TripCount =
3176 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3177 L->getLoopPreheader()->getTerminator());
3179 return TripCount;
3182 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3183 if (VectorTripCount)
3184 return VectorTripCount;
3186 Value *TC = getOrCreateTripCount(L);
3187 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3189 Type *Ty = TC->getType();
3190 // This is where we can make the step a runtime constant.
3191 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3193 // If the tail is to be folded by masking, round the number of iterations N
3194 // up to a multiple of Step instead of rounding down. This is done by first
3195 // adding Step-1 and then rounding down. Note that it's ok if this addition
3196 // overflows: the vector induction variable will eventually wrap to zero given
3197 // that it starts at zero and its Step is a power of two; the loop will then
3198 // exit, with the last early-exit vector comparison also producing all-true.
3199 if (Cost->foldTailByMasking()) {
3200 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3201 "VF*UF must be a power of 2 when folding tail by masking");
3202 assert(!VF.isScalable() &&
3203 "Tail folding not yet supported for scalable vectors");
3204 TC = Builder.CreateAdd(
3205 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3208 // Now we need to generate the expression for the part of the loop that the
3209 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3210 // iterations are not required for correctness, or N - Step, otherwise. Step
3211 // is equal to the vectorization factor (number of SIMD elements) times the
3212 // unroll factor (number of SIMD instructions).
3213 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3215 // There are cases where we *must* run at least one iteration in the remainder
3216 // loop. See the cost model for when this can happen. If the step evenly
3217 // divides the trip count, we set the remainder to be equal to the step. If
3218 // the step does not evenly divide the trip count, no adjustment is necessary
3219 // since there will already be scalar iterations. Note that the minimum
3220 // iterations check ensures that N >= Step.
3221 if (Cost->requiresScalarEpilogue(VF)) {
3222 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3223 R = Builder.CreateSelect(IsZero, Step, R);
3226 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
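// Illustrative examples (assumed values), with VF = 4 and UF = 2 so Step = 8:
// for N = 21, R = 21 % 8 = 5 and n.vec = 16; if a scalar epilogue is required
// and N = 24, R is bumped from 0 to 8 so n.vec = 16 and the scalar loop still
// runs. With tail folding, VF = 4, UF = 1 and N = 10: TC is rounded up to 13,
// R = 1 and n.vec = 12, covering all 10 iterations with a masked final vector
// iteration.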
3228 return VectorTripCount;
3231 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3232 const DataLayout &DL) {
3233 // Verify that V is a vector type with the same number of elements as DstVTy.
3234 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3235 unsigned VF = DstFVTy->getNumElements();
3236 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3237 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3238 Type *SrcElemTy = SrcVecTy->getElementType();
3239 Type *DstElemTy = DstFVTy->getElementType();
3240 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3241 "Vector elements must have same size");
3243 // Do a direct cast if element types are castable.
3244 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3245 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3247 // V cannot be cast directly to the desired vector type.
3248 // May happen when V is a floating point vector but DstVTy is a vector of
3249 // pointers or vice-versa. Handle this using a two-step bitcast using an
3250 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
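// Illustrative example (assuming 64-bit pointers): casting <2 x double> to
// <2 x i8*> goes via <2 x i64>, i.e. a bitcast to the integer vector followed
// by an inttoptr to the destination type.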
3251 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3252 "Only one type should be a pointer type");
3253 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3254 "Only one type should be a floating point type");
3255 Type *IntTy =
3256 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3257 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3258 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3259 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3262 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3263 BasicBlock *Bypass) {
3264 Value *Count = getOrCreateTripCount(L);
3265 // Reuse existing vector loop preheader for TC checks.
3266 // Note that a new preheader block is generated for the vector loop.
3267 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3268 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3270 // Generate code to check if the loop's trip count is less than VF * UF, or
3271 // equal to it in case a scalar epilogue is required; this implies that the
3272 // vector trip count is zero. This check also covers the case where adding one
3273 // to the backedge-taken count overflowed leading to an incorrect trip count
3274 // of zero. In this case we will also jump to the scalar loop.
3275 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3276 : ICmpInst::ICMP_ULT;
3278 // If tail is to be folded, vector loop takes care of all iterations.
3279 Value *CheckMinIters = Builder.getFalse();
3280 if (!Cost->foldTailByMasking()) {
3281 Value *Step =
3282 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3283 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
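// Illustrative example (assumed values): with VF = 4 and UF = 2, Step is 8;
// if no scalar epilogue is required we bypass the vector loop when Count < 8,
// and when an epilogue is required we bypass when Count <= 8 so that at least
// one scalar iteration remains.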
3285 // Create new preheader for vector loop.
3286 LoopVectorPreHeader =
3287 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3288 "vector.ph");
3290 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3291 DT->getNode(Bypass)->getIDom()) &&
3292 "TC check is expected to dominate Bypass");
3294 // Update dominator for Bypass & LoopExit (if needed).
3295 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3296 if (!Cost->requiresScalarEpilogue(VF))
3297 // If there is an epilogue which must run, there's no edge from the
3298 // middle block to exit blocks and thus no need to update the immediate
3299 // dominator of the exit blocks.
3300 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3302 ReplaceInstWithInst(
3303 TCCheckBlock->getTerminator(),
3304 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3305 LoopBypassBlocks.push_back(TCCheckBlock);
3308 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3310 BasicBlock *const SCEVCheckBlock =
3311 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3312 if (!SCEVCheckBlock)
3313 return nullptr;
3315 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3316 (OptForSizeBasedOnProfile &&
3317 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3318 "Cannot SCEV check stride or overflow when optimizing for size");
3321 // Update the dominator only if this is the first RT check.
3322 if (LoopBypassBlocks.empty()) {
3323 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3324 if (!Cost->requiresScalarEpilogue(VF))
3325 // If there is an epilogue which must run, there's no edge from the
3326 // middle block to exit blocks and thus no need to update the immediate
3327 // dominator of the exit blocks.
3328 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3331 LoopBypassBlocks.push_back(SCEVCheckBlock);
3332 AddedSafetyChecks = true;
3333 return SCEVCheckBlock;
3336 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3337 BasicBlock *Bypass) {
3338 // VPlan-native path does not do any analysis for runtime checks currently.
3339 if (EnableVPlanNativePath)
3340 return nullptr;
3342 BasicBlock *const MemCheckBlock =
3343 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3345 // Check if we generated code that checks at runtime whether arrays overlap.
3346 // We put the checks into a separate block to make the more common case of
3347 // few elements faster.
3348 if (!MemCheckBlock)
3349 return nullptr;
3351 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3352 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3353 "Cannot emit memory checks when optimizing for size, unless forced "
3354 "to vectorize.");
3355 ORE->emit([&]() {
3356 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3357 L->getStartLoc(), L->getHeader())
3358 << "Code-size may be reduced by not forcing "
3359 "vectorization, or by source-code modifications "
3360 "eliminating the need for runtime checks "
3361 "(e.g., adding 'restrict').";
3365 LoopBypassBlocks.push_back(MemCheckBlock);
3367 AddedSafetyChecks = true;
3369 // We currently don't use LoopVersioning for the actual loop cloning but we
3370 // still use it to add the noalias metadata.
3371 LVer = std::make_unique<LoopVersioning>(
3372 *Legal->getLAI(),
3373 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3374 DT, PSE.getSE());
3375 LVer->prepareNoAliasMetadata();
3376 return MemCheckBlock;
3379 Value *InnerLoopVectorizer::emitTransformedIndex(
3380 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3381 const InductionDescriptor &ID) const {
3383 SCEVExpander Exp(*SE, DL, "induction");
3384 auto Step = ID.getStep();
3385 auto StartValue = ID.getStartValue();
3386 assert(Index->getType()->getScalarType() == Step->getType() &&
3387 "Index scalar type does not match StepValue type");
3389 // Note: the IR at this point is broken. We cannot use SE to create any new
3390 // SCEV and then expand it, hoping that SCEV's simplification will give us
3391 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3392 // lead to various SCEV crashes. So all we can do is use the builder and rely
3393 // on InstCombine for future simplifications. Here we handle only some
3394 // trivial cases.
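// Illustrative example (assumed values): for an integer induction with
// StartValue 0 and constant Step 1, the CreateMul helper below folds
// Index * 1 to Index and the CreateAdd helper folds 0 + Index to Index, so
// the transformed index is Index itself and no new instructions are emitted.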
3395 auto CreateAdd = [&B](Value *X, Value *Y) {
3396 assert(X->getType() == Y->getType() && "Types don't match!");
3397 if (auto *CX = dyn_cast<ConstantInt>(X))
3398 if (CX->isZero())
3399 return Y;
3400 if (auto *CY = dyn_cast<ConstantInt>(Y))
3401 if (CY->isZero())
3402 return X;
3403 return B.CreateAdd(X, Y);
3406 // We allow X to be a vector type, in which case Y will potentially be
3407 // splatted into a vector with the same element count.
3408 auto CreateMul = [&B](Value *X, Value *Y) {
3409 assert(X->getType()->getScalarType() == Y->getType() &&
3410 "Types don't match!");
3411 if (auto *CX = dyn_cast<ConstantInt>(X))
3412 if (CX->isOne())
3413 return Y;
3414 if (auto *CY = dyn_cast<ConstantInt>(Y))
3415 if (CY->isOne())
3416 return X;
3417 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3418 if (XVTy && !isa<VectorType>(Y->getType()))
3419 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3420 return B.CreateMul(X, Y);
3423 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3424 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3425 // the DomTree is not kept up-to-date for additional blocks generated in the
3426 // vector loop. By using the header as insertion point, we guarantee that the
3427 // expanded instructions dominate all their uses.
3428 auto GetInsertPoint = [this, &B]() {
3429 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3430 if (InsertBB != LoopVectorBody &&
3431 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3432 return LoopVectorBody->getTerminator();
3433 return &*B.GetInsertPoint();
3436 switch (ID.getKind()) {
3437 case InductionDescriptor::IK_IntInduction: {
3438 assert(!isa<VectorType>(Index->getType()) &&
3439 "Vector indices not supported for integer inductions yet");
3440 assert(Index->getType() == StartValue->getType() &&
3441 "Index type does not match StartValue type");
3442 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3443 return B.CreateSub(StartValue, Index);
3444 auto *Offset = CreateMul(
3445 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3446 return CreateAdd(StartValue, Offset);
3448 case InductionDescriptor::IK_PtrInduction: {
3449 assert(isa<SCEVConstant>(Step) &&
3450 "Expected constant step for pointer induction");
3451 return B.CreateGEP(
3452 StartValue->getType()->getPointerElementType(), StartValue,
3453 CreateMul(Index,
3454 Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3455 GetInsertPoint())));
3457 case InductionDescriptor::IK_FpInduction: {
3458 assert(!isa<VectorType>(Index->getType()) &&
3459 "Vector indices not supported for FP inductions yet");
3460 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3461 auto InductionBinOp = ID.getInductionBinOp();
3462 assert(InductionBinOp &&
3463 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3464 InductionBinOp->getOpcode() == Instruction::FSub) &&
3465 "Original bin op should be defined for FP induction");
3467 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3468 Value *MulExp = B.CreateFMul(StepValue, Index);
3469 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3470 "induction");
3472 case InductionDescriptor::IK_NoInduction:
3473 return nullptr;
3475 llvm_unreachable("invalid enum");
3478 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3479 LoopScalarBody = OrigLoop->getHeader();
3480 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3481 assert(LoopVectorPreHeader && "Invalid loop structure");
3482 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3483 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3484 "multiple exit loop without required epilogue?");
3486 LoopMiddleBlock =
3487 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3488 LI, nullptr, Twine(Prefix) + "middle.block");
3489 LoopScalarPreHeader =
3490 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3491 nullptr, Twine(Prefix) + "scalar.ph");
3493 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3495 // Set up the middle block terminator. Two cases:
3496 // 1) If we know that we must execute the scalar epilogue, emit an
3497 // unconditional branch.
3498 // 2) Otherwise, we must have a single unique exit block (due to how we
3499 // implement the multiple exit case). In this case, set up a conditional
3500 // branch from the middle block to the loop scalar preheader, and the
3501 // exit block. completeLoopSkeleton will update the condition to use an
3502 // iteration check, if required to decide whether to execute the remainder.
3503 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3504 BranchInst::Create(LoopScalarPreHeader) :
3505 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3506 Builder.getTrue());
3507 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3508 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3510 // We intentionally don't let SplitBlock update LoopInfo since
3511 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3512 // LoopVectorBody is explicitly added to the correct place a few lines later.
3513 LoopVectorBody =
3514 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3515 nullptr, nullptr, Twine(Prefix) + "vector.body");
3517 // Update dominator for loop exit.
3518 if (!Cost->requiresScalarEpilogue(VF))
3519 // If there is an epilogue which must run, there's no edge from the
3520 // middle block to exit blocks and thus no need to update the immediate
3521 // dominator of the exit blocks.
3522 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3524 // Create and register the new vector loop.
3525 Loop *Lp = LI->AllocateLoop();
3526 Loop *ParentLoop = OrigLoop->getParentLoop();
3528 // Insert the new loop into the loop nest and register the new basic blocks
3529 // before calling any utilities such as SCEV that require valid LoopInfo.
3530 if (ParentLoop) {
3531 ParentLoop->addChildLoop(Lp);
3532 } else {
3533 LI->addTopLevelLoop(Lp);
3535 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3536 return Lp;
3539 void InnerLoopVectorizer::createInductionResumeValues(
3540 Loop *L, Value *VectorTripCount,
3541 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3542 assert(VectorTripCount && L && "Expected valid arguments");
3543 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3544 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3545 "Inconsistent information about additional bypass.");
3546 // We are going to resume the execution of the scalar loop.
3547 // Go over all of the induction variables that we found and fix the
3548 // PHIs that are left in the scalar version of the loop.
3549 // The starting values of PHI nodes depend on the counter of the last
3550 // iteration in the vectorized loop.
3551 // If we come from a bypass edge then we need to start from the original
3552 // start value.
3553 for (auto &InductionEntry : Legal->getInductionVars()) {
3554 PHINode *OrigPhi = InductionEntry.first;
3555 InductionDescriptor II = InductionEntry.second;
3557 // Create phi nodes to merge from the backedge-taken check block.
3558 PHINode *BCResumeVal =
3559 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3560 LoopScalarPreHeader->getTerminator());
3561 // Copy original phi DL over to the new one.
3562 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3563 Value *&EndValue = IVEndValues[OrigPhi];
3564 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3565 if (OrigPhi == OldInduction) {
3566 // We know what the end value is.
3567 EndValue = VectorTripCount;
3568 } else {
3569 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3571 // Fast-math-flags propagate from the original induction instruction.
3572 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3573 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3575 Type *StepType = II.getStep()->getType();
3576 Instruction::CastOps CastOp =
3577 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3578 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3579 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3580 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3581 EndValue->setName("ind.end");
3583 // Compute the end value for the additional bypass (if applicable).
3584 if (AdditionalBypass.first) {
3585 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3586 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3587 StepType, true);
3588 CRD =
3589 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3590 EndValueFromAdditionalBypass =
3591 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3592 EndValueFromAdditionalBypass->setName("ind.end");
3595 // The new PHI merges the original incoming value, in case of a bypass,
3596 // or the value at the end of the vectorized loop.
3597 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3599 // Fix the scalar body counter (PHI node).
3600 // The old induction's phi node in the scalar body needs the truncated
3601 // value.
3602 for (BasicBlock *BB : LoopBypassBlocks)
3603 BCResumeVal->addIncoming(II.getStartValue(), BB);
3605 if (AdditionalBypass.first)
3606 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3607 EndValueFromAdditionalBypass);
3609 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3613 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3614 MDNode *OrigLoopID) {
3615 assert(L && "Expected valid loop.");
3617 // The trip counts should be cached by now.
3618 Value *Count = getOrCreateTripCount(L);
3619 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3621 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3623 // Add a check in the middle block to see if we have completed
3624 // all of the iterations in the first vector loop. Three cases:
3625 // 1) If we require a scalar epilogue, there is no conditional branch as
3626 // we unconditionally branch to the scalar preheader. Do nothing.
3627 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3628 // Thus if tail is to be folded, we know we don't need to run the
3629 // remainder and we can use the previous value for the condition (true).
3630 // 3) Otherwise, construct a runtime check.
3631 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3632 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3633 Count, VectorTripCount, "cmp.n",
3634 LoopMiddleBlock->getTerminator());
3636 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3637 // of the corresponding compare because they may have ended up with
3638 // different line numbers and we want to avoid awkward line stepping while
3639 // debugging, e.g. if the compare got a line number inside the loop.
3640 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3641 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3644 // Get ready to start creating new instructions into the vectorized body.
3645 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3646 "Inconsistent vector loop preheader");
3647 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3649 Optional<MDNode *> VectorizedLoopID =
3650 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3651 LLVMLoopVectorizeFollowupVectorized});
3652 if (VectorizedLoopID.hasValue()) {
3653 L->setLoopID(VectorizedLoopID.getValue());
3655 // Do not setAlreadyVectorized if loop attributes have been defined
3656 // explicitly.
3657 return LoopVectorPreHeader;
3660 // Keep all loop hints from the original loop on the vector loop (we'll
3661 // replace the vectorizer-specific hints below).
3662 if (MDNode *LID = OrigLoop->getLoopID())
3663 L->setLoopID(LID);
3665 LoopVectorizeHints Hints(L, true, *ORE);
3666 Hints.setAlreadyVectorized();
3668 #ifdef EXPENSIVE_CHECKS
3669 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3670 LI->verify(*DT);
3671 #endif
3673 return LoopVectorPreHeader;
3676 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3677 /*
3678 In this function we generate a new loop. The new loop will contain
3679 the vectorized instructions while the old loop will continue to run the
3680 scalar remainder.
3682 [ ] <-- loop iteration number check.
3685 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3686 | / |
3687 | / v
3688 || [ ] <-- vector pre header.
3689 |/ |
3691 | [ ] \
3692 | [ ]_| <-- vector loop.
3695 \ -[ ] <--- middle-block.
3696 \/ |
3697 /\ v
3698 | ->[ ] <--- new preheader.
3700 (opt) v <-- edge from middle to exit iff epilogue is not required.
3701 | [ ] \
3702 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3705 >[ ] <-- exit block(s).
3708 */
3709 // Get the metadata of the original loop before it gets modified.
3710 MDNode *OrigLoopID = OrigLoop->getLoopID();
3712 // Workaround! Compute the trip count of the original loop and cache it
3713 // before we start modifying the CFG. This code has a systemic problem
3714 // wherein it tries to run analysis over partially constructed IR; this is
3715 // wrong, and not simply for SCEV. The trip count of the original loop
3716 // simply happens to be prone to hitting this in practice. In theory, we
3717 // can hit the same issue for any SCEV, or ValueTracking query done during
3718 // mutation. See PR49900.
3719 getOrCreateTripCount(OrigLoop);
3721 // Create an empty vector loop, and prepare basic blocks for the runtime
3722 // checks.
3723 Loop *Lp = createVectorLoopSkeleton("");
3725 // Now, compare the new count to zero. If it is zero skip the vector loop and
3726 // jump to the scalar loop. This check also covers the case where the
3727 // backedge-taken count is uint##_max: adding one to it will overflow leading
3728 // to an incorrect trip count of zero. In this (rare) case we will also jump
3729 // to the scalar loop.
3730 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3732 // Generate the code to check any assumptions that we've made for SCEV
3733 // expressions.
3734 emitSCEVChecks(Lp, LoopScalarPreHeader);
3736 // Generate the code that checks in runtime if arrays overlap. We put the
3737 // checks into a separate block to make the more common case of few elements
3738 // faster.
3739 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3741 // Some loops have a single integer induction variable, while other loops
3742 // don't. One example is C++ iterators, which often have multiple pointer
3743 // induction variables. The code below also supports the case where we
3744 // don't have a single induction variable.
3746 // We try hard to obtain an induction variable from the original loop.
3747 // However, if we don't find one that:
3748 // - is an integer
3749 // - counts from zero, stepping by one
3750 // - is the size of the widest induction variable type
3751 // then we create a new one.
3752 OldInduction = Legal->getPrimaryInduction();
3753 Type *IdxTy = Legal->getWidestInductionType();
3754 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3755 // The loop step is equal to the vectorization factor (num of SIMD elements)
3756 // times the unroll factor (num of SIMD instructions).
3757 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3758 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3759 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3760 Induction =
3761 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3762 getDebugLocFromInstOrOperands(OldInduction));
3764 // Emit phis for the new starting index of the scalar loop.
3765 createInductionResumeValues(Lp, CountRoundDown);
3767 return completeLoopSkeleton(Lp, OrigLoopID);
3770 // Fix up external users of the induction variable. At this point, we are
3771 // in LCSSA form, with all external PHIs that use the IV having one input value,
3772 // coming from the remainder loop. We need those PHIs to also have a correct
3773 // value for the IV when arriving directly from the middle block.
3774 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3775 const InductionDescriptor &II,
3776 Value *CountRoundDown, Value *EndValue,
3777 BasicBlock *MiddleBlock) {
3778 // There are two kinds of external IV usages - those that use the value
3779 // computed in the last iteration (the PHI) and those that use the penultimate
3780 // value (the value that feeds into the phi from the loop latch).
3781 // We allow both, but they, obviously, have different values.
3783 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3785 DenseMap<Value *, Value *> MissingVals;
3787 // An external user of the last iteration's value should see the value that
3788 // the remainder loop uses to initialize its own IV.
3789 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3790 for (User *U : PostInc->users()) {
3791 Instruction *UI = cast<Instruction>(U);
3792 if (!OrigLoop->contains(UI)) {
3793 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3794 MissingVals[UI] = EndValue;
3798 // An external user of the penultimate value needs to see EndValue - Step.
3799 // The simplest way to get this is to recompute it from the constituent SCEVs,
3800 // that is Start + (Step * (CRD - 1)).
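// Illustrative example (assumed values): with Start = 0, Step = 2 and
// CountRoundDown = 8, the escape value is 0 + 2 * (8 - 1) = 14, while a user
// of the last iteration's value sees EndValue = 16.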
3801 for (User *U : OrigPhi->users()) {
3802 auto *UI = cast<Instruction>(U);
3803 if (!OrigLoop->contains(UI)) {
3804 const DataLayout &DL =
3805 OrigLoop->getHeader()->getModule()->getDataLayout();
3806 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3808 IRBuilder<> B(MiddleBlock->getTerminator());
3810 // Fast-math-flags propagate from the original induction instruction.
3811 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3812 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3814 Value *CountMinusOne = B.CreateSub(
3815 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3816 Value *CMO =
3817 !II.getStep()->getType()->isIntegerTy()
3818 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3819 II.getStep()->getType())
3820 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3821 CMO->setName("cast.cmo");
3822 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3823 Escape->setName("ind.escape");
3824 MissingVals[UI] = Escape;
3828 for (auto &I : MissingVals) {
3829 PHINode *PHI = cast<PHINode>(I.first);
3830 // One corner case we have to handle is two IVs "chasing" each other,
3831 // that is %IV2 = phi [...], [ %IV1, %latch ]
3832 // In this case, if IV1 has an external use, we need to avoid adding both
3833 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3834 // don't already have an incoming value for the middle block.
3835 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3836 PHI->addIncoming(I.second, MiddleBlock);
3840 namespace {
3842 struct CSEDenseMapInfo {
3843 static bool canHandle(const Instruction *I) {
3844 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3845 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3848 static inline Instruction *getEmptyKey() {
3849 return DenseMapInfo<Instruction *>::getEmptyKey();
3852 static inline Instruction *getTombstoneKey() {
3853 return DenseMapInfo<Instruction *>::getTombstoneKey();
3856 static unsigned getHashValue(const Instruction *I) {
3857 assert(canHandle(I) && "Unknown instruction!");
3858 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3859 I->value_op_end()));
3862 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3863 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3864 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3865 return LHS == RHS;
3866 return LHS->isIdenticalTo(RHS);
3870 } // end anonymous namespace
3872 /// Perform CSE of induction variable instructions.
3873 static void cse(BasicBlock *BB) {
3874 // Perform simple cse.
3875 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3876 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3877 Instruction *In = &*I++;
3879 if (!CSEDenseMapInfo::canHandle(In))
3880 continue;
3882 // Check if we can replace this instruction with any of the
3883 // visited instructions.
3884 if (Instruction *V = CSEMap.lookup(In)) {
3885 In->replaceAllUsesWith(V);
3886 In->eraseFromParent();
3887 continue;
3890 CSEMap[In] = In;
3894 InstructionCost
3895 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3896 bool &NeedToScalarize) const {
3897 Function *F = CI->getCalledFunction();
3898 Type *ScalarRetTy = CI->getType();
3899 SmallVector<Type *, 4> Tys, ScalarTys;
3900 for (auto &ArgOp : CI->arg_operands())
3901 ScalarTys.push_back(ArgOp->getType());
3903 // Estimate the cost of a scalarized vector call. The source operands are
3904 // assumed to be vectors, so we need to extract individual elements from them,
3905 // execute VF scalar calls, and then gather the results into the vector return
3906 // value.
3907 InstructionCost ScalarCallCost =
3908 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3909 if (VF.isScalar())
3910 return ScalarCallCost;
3912 // Compute corresponding vector type for return value and arguments.
3913 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3914 for (Type *ScalarTy : ScalarTys)
3915 Tys.push_back(ToVectorTy(ScalarTy, VF));
3917 // Compute costs of unpacking argument values for the scalar calls and
3918 // packing the return values to a vector.
3919 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3921 InstructionCost Cost =
3922 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
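// Illustrative example (hypothetical costs): with VF = 4, a scalar call cost
// of 10 and a scalarization overhead of 12, Cost = 4 * 10 + 12 = 52; if a
// vector variant exists with cost 20 < 52, NeedToScalarize is cleared below
// and 20 is returned instead.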
3924 // If we can't emit a vector call for this function, then the currently found
3925 // cost is the cost we need to return.
3926 NeedToScalarize = true;
3927 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3928 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3930 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3931 return Cost;
3933 // If the corresponding vector cost is cheaper, return its cost.
3934 InstructionCost VectorCallCost =
3935 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3936 if (VectorCallCost < Cost) {
3937 NeedToScalarize = false;
3938 Cost = VectorCallCost;
3940 return Cost;
3943 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3944 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3945 return Elt;
3946 return VectorType::get(Elt, VF);
3949 InstructionCost
3950 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3951 ElementCount VF) const {
3952 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3953 assert(ID && "Expected intrinsic call!");
3954 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3955 FastMathFlags FMF;
3956 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3957 FMF = FPMO->getFastMathFlags();
3959 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3960 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3961 SmallVector<Type *> ParamTys;
3962 std::transform(FTy->param_begin(), FTy->param_end(),
3963 std::back_inserter(ParamTys),
3964 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3966 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3967 dyn_cast<IntrinsicInst>(CI));
3968 return TTI.getIntrinsicInstrCost(CostAttrs,
3969 TargetTransformInfo::TCK_RecipThroughput);
3972 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3973 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3974 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3975 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3978 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3979 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3980 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3981 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3984 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3985 // For every instruction `I` in MinBWs, truncate the operands, create a
3986 // truncated version of `I` and reextend its result. InstCombine runs
3987 // later and will remove any ext/trunc pairs.
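// Illustrative example (assumed values): if MinBWs records that an i32 add
// only needs 8 bits and VF = 4, its operands are shrunk to <4 x i8>, the add
// is recreated on <4 x i8>, and the result is zero-extended back to
// <4 x i32>; InstCombine later removes redundant ext/trunc pairs.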
3988 SmallPtrSet<Value *, 4> Erased;
3989 for (const auto &KV : Cost->getMinimalBitwidths()) {
3990 // If the value wasn't vectorized, we must maintain the original scalar
3991 // type. The absence of the value from State indicates that it
3992 // wasn't vectorized.
3993 VPValue *Def = State.Plan->getVPValue(KV.first);
3994 if (!State.hasAnyVectorValue(Def))
3995 continue;
3996 for (unsigned Part = 0; Part < UF; ++Part) {
3997 Value *I = State.get(Def, Part);
3998 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3999 continue;
4000 Type *OriginalTy = I->getType();
4001 Type *ScalarTruncatedTy =
4002 IntegerType::get(OriginalTy->getContext(), KV.second);
4003 auto *TruncatedTy = VectorType::get(
4004 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
4005 if (TruncatedTy == OriginalTy)
4006 continue;
4008 IRBuilder<> B(cast<Instruction>(I));
4009 auto ShrinkOperand = [&](Value *V) -> Value * {
4010 if (auto *ZI = dyn_cast<ZExtInst>(V))
4011 if (ZI->getSrcTy() == TruncatedTy)
4012 return ZI->getOperand(0);
4013 return B.CreateZExtOrTrunc(V, TruncatedTy);
4016 // The actual instruction modification depends on the instruction type,
4017 // unfortunately.
4018 Value *NewI = nullptr;
4019 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
4020 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
4021 ShrinkOperand(BO->getOperand(1)));
4023 // Any wrapping introduced by shrinking this operation shouldn't be
4024 // considered undefined behavior. So, we can't unconditionally copy
4025 // arithmetic wrapping flags to NewI.
4026 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
4027 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
4028 NewI =
4029 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
4030 ShrinkOperand(CI->getOperand(1)));
4031 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
4032 NewI = B.CreateSelect(SI->getCondition(),
4033 ShrinkOperand(SI->getTrueValue()),
4034 ShrinkOperand(SI->getFalseValue()));
4035 } else if (auto *CI = dyn_cast<CastInst>(I)) {
4036 switch (CI->getOpcode()) {
4037 default:
4038 llvm_unreachable("Unhandled cast!");
4039 case Instruction::Trunc:
4040 NewI = ShrinkOperand(CI->getOperand(0));
4041 break;
4042 case Instruction::SExt:
4043 NewI = B.CreateSExtOrTrunc(
4044 CI->getOperand(0),
4045 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4046 break;
4047 case Instruction::ZExt:
4048 NewI = B.CreateZExtOrTrunc(
4049 CI->getOperand(0),
4050 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4051 break;
4053 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4054 auto Elements0 =
4055 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
4056 auto *O0 = B.CreateZExtOrTrunc(
4057 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
4058 auto Elements1 =
4059 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
4060 auto *O1 = B.CreateZExtOrTrunc(
4061 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
4063 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4064 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4065 // Don't do anything with the operands, just extend the result.
4066 continue;
4067 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4068 auto Elements =
4069 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
4070 auto *O0 = B.CreateZExtOrTrunc(
4071 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4072 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4073 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4074 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4075 auto Elements =
4076 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
4077 auto *O0 = B.CreateZExtOrTrunc(
4078 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4079 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4080 } else {
4081 // If we don't know what to do, be conservative and don't do anything.
4082 continue;
4085 // Lastly, extend the result.
4086 NewI->takeName(cast<Instruction>(I));
4087 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4088 I->replaceAllUsesWith(Res);
4089 cast<Instruction>(I)->eraseFromParent();
4090 Erased.insert(I);
4091 State.reset(Def, Res, Part);
4095 // We'll have created a bunch of ZExts that are now dead. Clean them up.
4096 for (const auto &KV : Cost->getMinimalBitwidths()) {
4097 // If the value wasn't vectorized, we must maintain the original scalar
4098 // type. The absence of the value from State indicates that it
4099 // wasn't vectorized.
4100 VPValue *Def = State.Plan->getVPValue(KV.first);
4101 if (!State.hasAnyVectorValue(Def))
4102 continue;
4103 for (unsigned Part = 0; Part < UF; ++Part) {
4104 Value *I = State.get(Def, Part);
4105 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4106 if (Inst && Inst->use_empty()) {
4107 Value *NewI = Inst->getOperand(0);
4108 Inst->eraseFromParent();
4109 State.reset(Def, NewI, Part);
4115 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4116 // Insert truncates and extends for any truncated instructions as hints to
4117 // InstCombine.
4118 if (VF.isVector())
4119 truncateToMinimalBitwidths(State);
4121 // Fix widened non-induction PHIs by setting up the PHI operands.
4122 if (OrigPHIsToFix.size()) {
4123 assert(EnableVPlanNativePath &&
4124 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4125 fixNonInductionPHIs(State);
4128 // At this point every instruction in the original loop is widened to a
4129 // vector form. Now we need to fix the recurrences in the loop. These PHI
4130 // nodes are currently empty because we did not want to introduce cycles.
4131 // This is the second stage of vectorizing recurrences.
4132 fixCrossIterationPHIs(State);
4134 // Forget the original basic block.
4135 PSE.getSE()->forgetLoop(OrigLoop);
4137 // If we inserted an edge from the middle block to the unique exit block,
4138 // update uses outside the loop (phis) to account for the newly inserted
4139 // edge.
4140 if (!Cost->requiresScalarEpilogue(VF)) {
4141 // Fix-up external users of the induction variables.
4142 for (auto &Entry : Legal->getInductionVars())
4143 fixupIVUsers(Entry.first, Entry.second,
4144 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4145 IVEndValues[Entry.first], LoopMiddleBlock);
4147 fixLCSSAPHIs(State);
4150 for (Instruction *PI : PredicatedInstructions)
4151 sinkScalarOperands(&*PI);
4153 // Remove redundant induction instructions.
4154 cse(LoopVectorBody);
4156 // Set/update profile weights for the vector and remainder loops as original
4157 // loop iterations are now distributed among them. Note that original loop
4158 // represented by LoopScalarBody becomes remainder loop after vectorization.
4160 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4161 // end up with a slightly less precise result, but that should be OK since
4162 // the profile is not inherently precise anyway. Note also that a possible
4163 // bypass of the vector code caused by legality checks is ignored,
4164 // optimistically assigning all the weight to the vector loop.
4166 // For scalable vectorization we can't know at compile time how many
4167 // iterations of the loop are handled in one vector iteration, so instead
4168 // assume a pessimistic vscale of '1'.
4169 setProfileInfoAfterUnrolling(
4170 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4171 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4174 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4175 // In order to support recurrences we need to be able to vectorize Phi nodes.
4176 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4177 // stage #2: We now need to fix the recurrences by adding incoming edges to
4178 // the currently empty PHI nodes. At this point every instruction in the
4179 // original loop is widened to a vector form so we can use them to construct
4180 // the incoming edges.
4181 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4182 for (VPRecipeBase &R : Header->phis()) {
4183 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4184 fixReduction(ReductionPhi, State);
4185 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4186 fixFirstOrderRecurrence(FOR, State);
4190 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4191 VPTransformState &State) {
4192 // This is the second phase of vectorizing first-order recurrences. An
4193 // overview of the transformation is described below. Suppose we have the
4194 // following loop.
4196 // for (int i = 0; i < n; ++i)
4197 // b[i] = a[i] - a[i - 1];
4199 // There is a first-order recurrence on "a". For this loop, the shorthand
4200 // scalar IR looks like:
4202 // scalar.ph:
4203 // s_init = a[-1]
4204 // br scalar.body
4206 // scalar.body:
4207 // i = phi [0, scalar.ph], [i+1, scalar.body]
4208 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4209 // s2 = a[i]
4210 // b[i] = s2 - s1
4211 // br cond, scalar.body, ...
4213 // In this example, s1 is a recurrence because its value depends on the
4214 // previous iteration. In the first phase of vectorization, we created a
4215 // vector phi v1 for s1. We now complete the vectorization and produce the
4216 // shorthand vector IR shown below (for VF = 4, UF = 1).
4218 // vector.ph:
4219 // v_init = vector(..., ..., ..., a[-1])
4220 // br vector.body
4222 // vector.body
4223 // i = phi [0, vector.ph], [i+4, vector.body]
4224 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4225 // v2 = a[i, i+1, i+2, i+3];
4226 // v3 = vector(v1(3), v2(0, 1, 2))
4227 // b[i, i+1, i+2, i+3] = v2 - v3
4228 // br cond, vector.body, middle.block
4230 // middle.block:
4231 // x = v2(3)
4232 // br scalar.ph
4234 // scalar.ph:
4235 // s_init = phi [x, middle.block], [a[-1], otherwise]
4236 // br scalar.body
4238 // After the vector loop finishes executing, we extract the next value of
4239 // the recurrence (x) to use as the initial value in the scalar loop.
4241 // Extract the last vector element in the middle block. This will be the
4242 // initial value for the recurrence when jumping to the scalar loop.
4243 VPValue *PreviousDef = PhiR->getBackedgeValue();
4244 Value *Incoming = State.get(PreviousDef, UF - 1);
4245 auto *ExtractForScalar = Incoming;
4246 auto *IdxTy = Builder.getInt32Ty();
4247 if (VF.isVector()) {
4248 auto *One = ConstantInt::get(IdxTy, 1);
4249 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4250 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4251 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4252 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4253 "vector.recur.extract");
4255 // Extract the second-to-last element in the middle block if the
4256 // Phi is used outside the loop. We need to extract the phi itself
4257 // and not the last element (the phi update in the current iteration). This
4258 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4259 // when the scalar loop is not run at all.
4260 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4261 if (VF.isVector()) {
4262 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4263 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4264 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4265 Incoming, Idx, "vector.recur.extract.for.phi");
4266 } else if (UF > 1)
4267 // When the loop is unrolled without vectorizing, initialize
4268 // ExtractForPhiUsedOutsideLoop with the second-to-last unrolled value of
4269 // `Incoming`. This is analogous to the vectorized case above: extracting
4270 // the second-to-last element when VF > 1.
4271 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4273 // Fix the initial value of the original recurrence in the scalar loop.
4274 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4275 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4276 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4277 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4278 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4279 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4280 Start->addIncoming(Incoming, BB);
4283 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4284 Phi->setName("scalar.recur");
4286 // Finally, fix users of the recurrence outside the loop. The users will need
4287 // either the last value of the scalar recurrence or the last value of the
4288 // vector recurrence we extracted in the middle block. Since the loop is in
4289 // LCSSA form, we just need to find all the phi nodes for the original scalar
4290 // recurrence in the exit block, and then add an edge for the middle block.
4291 // Note that LCSSA does not imply single entry when the original scalar loop
4292 // had multiple exiting edges (as we always run the last iteration in the
4293 // scalar epilogue); in that case, there is no edge from the middle block to
4294 // the exit block, and thus no phis that need to be updated.
4295 if (!Cost->requiresScalarEpilogue(VF))
4296 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4297 if (any_of(LCSSAPhi.incoming_values(),
4298 [Phi](Value *V) { return V == Phi; }))
4299 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4302 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4303 VPTransformState &State) {
4304 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4305 // Get its reduction variable descriptor.
4306 assert(Legal->isReductionVariable(OrigPhi) &&
4307 "Unable to find the reduction variable");
4308 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4310 RecurKind RK = RdxDesc.getRecurrenceKind();
4311 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4312 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4313 setDebugLocFromInst(ReductionStartValue);
4315 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4316 // This is the vector-clone of the value that leaves the loop.
4317 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4319 // Wrap flags are in general invalid after vectorization, clear them.
4320 clearReductionWrapFlags(RdxDesc, State);
4322 // Before each round, move the insertion point right between
4323 // the PHIs and the values we are going to write.
4324 // This allows us to write both PHINodes and the extractelement
4325 // instructions.
4326 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4328 setDebugLocFromInst(LoopExitInst);
4330 Type *PhiTy = OrigPhi->getType();
4331 // If tail is folded by masking, the vector value to leave the loop should be
4332 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4333 // instead of the former. For an inloop reduction the reduction will already
4334 // be predicated, and does not need to be handled here.
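// Shorthand sketch of the assumed shape (VF = 4):
//   %rdx.next = add <4 x i32> %rdx.phi, %val
//   %rdx.sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// Below we locate that select among the users of the vectorized exit
// instruction and make it the value that leaves the loop.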
4335 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4336 for (unsigned Part = 0; Part < UF; ++Part) {
4337 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4338 Value *Sel = nullptr;
4339 for (User *U : VecLoopExitInst->users()) {
4340 if (isa<SelectInst>(U)) {
4341 assert(!Sel && "Reduction exit feeding two selects");
4342 Sel = U;
4343 } else
4344 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4346 assert(Sel && "Reduction exit feeds no select");
4347 State.reset(LoopExitInstDef, Sel, Part);
4349 // If the target can create a predicated operator for the reduction at no
4350 // extra cost in the loop (for example a predicated vadd), it can be
4351 // cheaper for the select to remain in the loop than be sunk out of it,
4352 // and so use the select value for the phi instead of the old
4353 // LoopExitValue.
4354 if (PreferPredicatedReductionSelect ||
4355 TTI->preferPredicatedReductionSelect(
4356 RdxDesc.getOpcode(), PhiTy,
4357 TargetTransformInfo::ReductionFlags())) {
4358 auto *VecRdxPhi =
4359 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4360 VecRdxPhi->setIncomingValueForBlock(
4361 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4366 // If the vector reduction can be performed in a smaller type, we truncate
4367 // then extend the loop exit value to enable InstCombine to evaluate the
4368 // entire expression in the smaller type.
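// Sketch, assuming an i32 reduction that fits in i8 (VF = 4, signed):
//   %t = trunc <4 x i32> %rdx to <4 x i8>
//   %e = sext <4 x i8> %t to <4 x i32>
// In-loop users of %rdx are rewired to %e, and the truncated value is what
// gets reduced in the middle block.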
4369 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4370 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4371 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4372 Builder.SetInsertPoint(
4373 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4374 VectorParts RdxParts(UF);
4375 for (unsigned Part = 0; Part < UF; ++Part) {
4376 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4377 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4378 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4379 : Builder.CreateZExt(Trunc, VecTy);
4380 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4381 UI != RdxParts[Part]->user_end();)
4382 if (*UI != Trunc) {
4383 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4384 RdxParts[Part] = Extnd;
4385 } else {
4386 ++UI;
4389 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4390 for (unsigned Part = 0; Part < UF; ++Part) {
4391 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4392 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4396 // Reduce all of the unrolled parts into a single vector.
4397 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4398 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4400 // The middle block terminator has already been assigned a DebugLoc here (the
4401 // OrigLoop's single latch terminator). We want the whole middle block to
4402 // appear to execute on this line because: (a) it is all compiler generated,
4403 // (b) these instructions are always executed after evaluating the latch
4404 // conditional branch, and (c) other passes may add new predecessors which
4405 // terminate on this line. This is the easiest way to ensure we don't
4406 // accidentally cause an extra step back into the loop while debugging.
4407 setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4408 if (PhiR->isOrdered())
4409 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4410 else {
4411 // Floating-point operations should have some FMF to enable the reduction.
4412 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4413 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
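// The loop below combines the UF unrolled parts pairwise; e.g. (sketch) for
// UF = 2 and an integer add reduction:
//   %bin.rdx = add <4 x i32> %part1, %part0
// Min/max recurrences use a min/max operation instead of a binary operator.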
4414 for (unsigned Part = 1; Part < UF; ++Part) {
4415 Value *RdxPart = State.get(LoopExitInstDef, Part);
4416 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4417 ReducedPartRdx = Builder.CreateBinOp(
4418 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4419 } else {
4420 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4425 // Create the reduction after the loop. Note that inloop reductions create the
4426 // target reduction in the loop using a Reduction recipe.
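// E.g. (sketch) for an integer add reduction with VF = 4 this becomes a call
// to @llvm.vector.reduce.add.v4i32 on the combined vector from above.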
4427 if (VF.isVector() && !PhiR->isInLoop()) {
4428 ReducedPartRdx =
4429 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4430 // If the reduction can be performed in a smaller type, we need to extend
4431 // the reduction to the wider type before we branch to the original loop.
4432 if (PhiTy != RdxDesc.getRecurrenceType())
4433 ReducedPartRdx = RdxDesc.isSigned()
4434 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4435 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4438 // Create a phi node that merges control-flow from the backedge-taken check
4439 // block and the middle block.
4440 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4441 LoopScalarPreHeader->getTerminator());
4442 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4443 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4444 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4446 // Now, we need to fix the users of the reduction variable
4447 // inside and outside of the scalar remainder loop.
4449 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4450 // in the exit blocks. See comment on analogous loop in
4451 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4452 if (!Cost->requiresScalarEpilogue(VF))
4453 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4454 if (any_of(LCSSAPhi.incoming_values(),
4455 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4456 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4458 // Fix the scalar loop reduction variable with the incoming reduction sum
4459 // from the vector body and from the backedge value.
4460 int IncomingEdgeBlockIdx =
4461 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4462 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4463 // Pick the other block.
4464 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4465 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4466 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4469 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4470 VPTransformState &State) {
4471 RecurKind RK = RdxDesc.getRecurrenceKind();
4472 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4473 return;
4475 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4476 assert(LoopExitInstr && "null loop exit instruction");
4477 SmallVector<Instruction *, 8> Worklist;
4478 SmallPtrSet<Instruction *, 8> Visited;
4479 Worklist.push_back(LoopExitInstr);
4480 Visited.insert(LoopExitInstr);
4482 while (!Worklist.empty()) {
4483 Instruction *Cur = Worklist.pop_back_val();
4484 if (isa<OverflowingBinaryOperator>(Cur))
4485 for (unsigned Part = 0; Part < UF; ++Part) {
4486 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4487 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4490 for (User *U : Cur->users()) {
4491 Instruction *UI = cast<Instruction>(U);
4492 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4493 Visited.insert(UI).second)
4494 Worklist.push_back(UI);
4499 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4500 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4501 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4502 // Some phis were already hand-updated by the reduction and recurrence
4503 // code above; leave them alone.
4504 continue;
4506 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4507 // Non-instruction incoming values (e.g. constants) have only a single value.
4509 VPLane Lane = VPLane::getFirstLane();
4510 if (isa<Instruction>(IncomingValue) &&
4511 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4512 VF))
4513 Lane = VPLane::getLastLaneForVF(VF);
4515 // Can be a loop invariant incoming value or the last scalar value to be
4516 // extracted from the vectorized loop.
4517 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4518 Value *lastIncomingValue =
4519 OrigLoop->isLoopInvariant(IncomingValue)
4520 ? IncomingValue
4521 : State.get(State.Plan->getVPValue(IncomingValue),
4522 VPIteration(UF - 1, Lane));
4523 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4527 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4528 // The basic block and loop containing the predicated instruction.
4529 auto *PredBB = PredInst->getParent();
4530 auto *VectorLoop = LI->getLoopFor(PredBB);
4532 // Initialize a worklist with the operands of the predicated instruction.
4533 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4535 // Holds instructions that we need to analyze again. An instruction may be
4536 // reanalyzed if we don't yet know if we can sink it or not.
4537 SmallVector<Instruction *, 8> InstsToReanalyze;
4539 // Returns true if a given use occurs in the predicated block. Phi nodes use
4540 // their operands in their corresponding predecessor blocks.
4541 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4542 auto *I = cast<Instruction>(U.getUser());
4543 BasicBlock *BB = I->getParent();
4544 if (auto *Phi = dyn_cast<PHINode>(I))
4545 BB = Phi->getIncomingBlock(
4546 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4547 return BB == PredBB;
4550 // Iteratively sink the scalarized operands of the predicated instruction
4551 // into the block we created for it. When an instruction is sunk, its
4552 // operands are then added to the worklist. The algorithm ends once a full
4553 // pass through the worklist fails to sink a single instruction.
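// Sketch of the idea: if the address computation feeding a predicated store,
// say %gep = getelementptr ..., currently sits outside the predicated block
// but is only used inside it, %gep is moved there and its own operands become
// new sinking candidates.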
4554 bool Changed;
4555 do {
4556 // Add the instructions that need to be reanalyzed to the worklist, and
4557 // reset the changed indicator.
4558 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4559 InstsToReanalyze.clear();
4560 Changed = false;
4562 while (!Worklist.empty()) {
4563 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4565 // We can't sink an instruction if it is a phi node, is not in the loop,
4566 // or may have side effects.
4567 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4568 I->mayHaveSideEffects())
4569 continue;
4571 // If the instruction is already in PredBB, check if we can sink its
4572 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4573 // sinking the scalar instruction I, hence it appears in PredBB; but it
4574 // may have failed to sink I's operands (recursively), which we try
4575 // (again) here.
4576 if (I->getParent() == PredBB) {
4577 Worklist.insert(I->op_begin(), I->op_end());
4578 continue;
4581 // It's legal to sink the instruction if all its uses occur in the
4582 // predicated block. Otherwise, there's nothing to do yet, and we may
4583 // need to reanalyze the instruction.
4584 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4585 InstsToReanalyze.push_back(I);
4586 continue;
4589 // Move the instruction to the beginning of the predicated block, and add
4590 // its operands to the worklist.
4591 I->moveBefore(&*PredBB->getFirstInsertionPt());
4592 Worklist.insert(I->op_begin(), I->op_end());
4594 // The sinking may have enabled other instructions to be sunk, so we will
4595 // need to iterate.
4596 Changed = true;
4598 } while (Changed);
4601 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4602 for (PHINode *OrigPhi : OrigPHIsToFix) {
4603 VPWidenPHIRecipe *VPPhi =
4604 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4605 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4606 // Make sure the builder has a valid insert point.
4607 Builder.SetInsertPoint(NewPhi);
4608 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4609 VPValue *Inc = VPPhi->getIncomingValue(i);
4610 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4611 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4616 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4617 return Cost->useOrderedReductions(RdxDesc);
4620 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4621 VPUser &Operands, unsigned UF,
4622 ElementCount VF, bool IsPtrLoopInvariant,
4623 SmallBitVector &IsIndexLoopInvariant,
4624 VPTransformState &State) {
4625 // Construct a vector GEP by widening the operands of the scalar GEP as
4626 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4627 // results in a vector of pointers when at least one operand of the GEP
4628 // is vector-typed. Thus, to keep the representation compact, we only use
4629 // vector-typed operands for loop-varying values.
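// E.g. (sketch): with an invariant base %p and a loop-varying index, a GEP
// such as
//   %gep = getelementptr i32, i32* %p, i64 %iv
// is widened to a GEP with a scalar base and a vector index, yielding a
// <VF x i32*> vector of pointers.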
4631 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4632 // If we are vectorizing, but the GEP has only loop-invariant operands,
4633 // the GEP we build (by only using vector-typed operands for
4634 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4635 // produce a vector of pointers, we need to either arbitrarily pick an
4636 // operand to broadcast, or broadcast a clone of the original GEP.
4637 // Here, we broadcast a clone of the original.
4639 // TODO: If at some point we decide to scalarize instructions having
4640 // loop-invariant operands, this special case will no longer be
4641 // required. We would add the scalarization decision to
4642 // collectLoopScalars() and teach getVectorValue() to broadcast
4643 // the lane-zero scalar value.
4644 auto *Clone = Builder.Insert(GEP->clone());
4645 for (unsigned Part = 0; Part < UF; ++Part) {
4646 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4647 State.set(VPDef, EntryPart, Part);
4648 addMetadata(EntryPart, GEP);
4650 } else {
4651 // If the GEP has at least one loop-varying operand, we are sure to
4652 // produce a vector of pointers. But if we are only unrolling, we want
4653 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4654 // produce with the code below will be scalar (if VF == 1) or vector
4655 // (otherwise). Note that for the unroll-only case, we still maintain
4656 // values in the vector mapping with initVector, as we do for other
4657 // instructions.
4658 for (unsigned Part = 0; Part < UF; ++Part) {
4659 // The pointer operand of the new GEP. If it's loop-invariant, we
4660 // won't broadcast it.
4661 auto *Ptr = IsPtrLoopInvariant
4662 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4663 : State.get(Operands.getOperand(0), Part);
4665 // Collect all the indices for the new GEP. If any index is
4666 // loop-invariant, we won't broadcast it.
4667 SmallVector<Value *, 4> Indices;
4668 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4669 VPValue *Operand = Operands.getOperand(I);
4670 if (IsIndexLoopInvariant[I - 1])
4671 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4672 else
4673 Indices.push_back(State.get(Operand, Part));
4676 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4677 // but it should be a vector, otherwise.
4678 auto *NewGEP =
4679 GEP->isInBounds()
4680 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4681 Indices)
4682 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4683 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4684 "NewGEP is not a pointer vector");
4685 State.set(VPDef, NewGEP, Part);
4686 addMetadata(NewGEP, GEP);
4691 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4692 VPWidenPHIRecipe *PhiR,
4693 VPTransformState &State) {
4694 PHINode *P = cast<PHINode>(PN);
4695 if (EnableVPlanNativePath) {
4696 // Currently we enter here in the VPlan-native path for non-induction
4697 // PHIs where all control flow is uniform. We simply widen these PHIs.
4698 // Create a vector phi with no operands - the vector phi operands will be
4699 // set at the end of vector code generation.
4700 Type *VecTy = (State.VF.isScalar())
4701 ? PN->getType()
4702 : VectorType::get(PN->getType(), State.VF);
4703 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4704 State.set(PhiR, VecPhi, 0);
4705 OrigPHIsToFix.push_back(P);
4707 return;
4710 assert(PN->getParent() == OrigLoop->getHeader() &&
4711 "Non-header phis should have been handled elsewhere");
4713 // In order to support recurrences we need to be able to vectorize Phi nodes.
4714 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4715 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4716 // this value when we vectorize all of the instructions that use the PHI.
4718 assert(!Legal->isReductionVariable(P) &&
4719 "reductions should be handled elsewhere");
4721 setDebugLocFromInst(P);
4723 // This PHINode must be an induction variable.
4724 // Make sure that we know about it.
4725 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4727 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4728 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4730 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4731 // which can be found from the original scalar operations.
4732 switch (II.getKind()) {
4733 case InductionDescriptor::IK_NoInduction:
4734 llvm_unreachable("Unknown induction");
4735 case InductionDescriptor::IK_IntInduction:
4736 case InductionDescriptor::IK_FpInduction:
4737 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4738 case InductionDescriptor::IK_PtrInduction: {
4739 // Handle the pointer induction variable case.
4740 assert(P->getType()->isPointerTy() && "Unexpected type.");
4742 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4743 // This is the normalized GEP that starts counting at zero.
4744 Value *PtrInd =
4745 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4746 // Determine the number of scalars we need to generate for each unroll
4747 // iteration. If the instruction is uniform, we only need to generate the
4748 // first lane. Otherwise, we generate all VF values.
4749 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4750 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4752 bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4753 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4754 if (NeedsVectorIndex) {
4755 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4756 UnitStepVec = Builder.CreateStepVector(VecIVTy);
4757 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4760 for (unsigned Part = 0; Part < UF; ++Part) {
4761 Value *PartStart = createStepForVF(
4762 Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4764 if (NeedsVectorIndex) {
4765 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4766 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4767 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4768 Value *SclrGep =
4769 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4770 SclrGep->setName("next.gep");
4771 State.set(PhiR, SclrGep, Part);
4772 // We've cached the whole vector, which means we can support the
4773 // extraction of any lane.
4774 continue;
4777 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4778 Value *Idx = Builder.CreateAdd(
4779 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4780 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4781 Value *SclrGep =
4782 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4783 SclrGep->setName("next.gep");
4784 State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4787 return;
4789 assert(isa<SCEVConstant>(II.getStep()) &&
4790 "Induction step not a SCEV constant!");
4791 Type *PhiType = II.getStep()->getType();
4793 // Build a pointer phi
4794 Value *ScalarStartValue = II.getStartValue();
4795 Type *ScStValueType = ScalarStartValue->getType();
4796 PHINode *NewPointerPhi =
4797 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4798 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4800 // A pointer induction, advanced by a GEP in the loop latch.
4801 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4802 Instruction *InductionLoc = LoopLatch->getTerminator();
4803 const SCEV *ScalarStep = II.getStep();
4804 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4805 Value *ScalarStepValue =
4806 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4807 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4808 Value *NumUnrolledElems =
4809 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4810 Value *InductionGEP = GetElementPtrInst::Create(
4811 ScStValueType->getPointerElementType(), NewPointerPhi,
4812 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4813 InductionLoc);
4814 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4816 // Create UF many actual address geps that use the pointer
4817 // phi as base and a vectorized version of the step value
4818 // (<step*0, ..., step*N>) as offset.
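// Sketch for VF = 4, UF = 2 and element step S: part 0 uses offsets
// <0, 1, 2, 3> * S and part 1 uses <4, 5, 6, 7> * S, each added to the
// pointer phi with a single vector GEP.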
4819 for (unsigned Part = 0; Part < State.UF; ++Part) {
4820 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4821 Value *StartOffsetScalar =
4822 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4823 Value *StartOffset =
4824 Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4825 // Create a vector of consecutive numbers from zero to VF-1.
4826 StartOffset =
4827 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4829 Value *GEP = Builder.CreateGEP(
4830 ScStValueType->getPointerElementType(), NewPointerPhi,
4831 Builder.CreateMul(
4832 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4833 "vector.gep"));
4834 State.set(PhiR, GEP, Part);
4840 /// A helper function for checking whether an integer division-related
4841 /// instruction may divide by zero (in which case it must be predicated if
4842 /// executed conditionally in the scalar code).
4843 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4844 /// Non-zero divisors that are not compile-time constants will not be
4845 /// converted into multiplication, so we will still end up scalarizing
4846 /// the division, but can do so w/o predication.
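/// For illustration: in `a[i] / b[i]` the divisor is only known at run time
/// and may be zero, so the division must be predicated when executed
/// conditionally, whereas `a[i] / 7` can never trap and needs no predication.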
4847 static bool mayDivideByZero(Instruction &I) {
4848 assert((I.getOpcode() == Instruction::UDiv ||
4849 I.getOpcode() == Instruction::SDiv ||
4850 I.getOpcode() == Instruction::URem ||
4851 I.getOpcode() == Instruction::SRem) &&
4852 "Unexpected instruction");
4853 Value *Divisor = I.getOperand(1);
4854 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4855 return !CInt || CInt->isZero();
4858 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4859 VPUser &User,
4860 VPTransformState &State) {
4861 switch (I.getOpcode()) {
4862 case Instruction::Call:
4863 case Instruction::Br:
4864 case Instruction::PHI:
4865 case Instruction::GetElementPtr:
4866 case Instruction::Select:
4867 llvm_unreachable("This instruction is handled by a different recipe.");
4868 case Instruction::UDiv:
4869 case Instruction::SDiv:
4870 case Instruction::SRem:
4871 case Instruction::URem:
4872 case Instruction::Add:
4873 case Instruction::FAdd:
4874 case Instruction::Sub:
4875 case Instruction::FSub:
4876 case Instruction::FNeg:
4877 case Instruction::Mul:
4878 case Instruction::FMul:
4879 case Instruction::FDiv:
4880 case Instruction::FRem:
4881 case Instruction::Shl:
4882 case Instruction::LShr:
4883 case Instruction::AShr:
4884 case Instruction::And:
4885 case Instruction::Or:
4886 case Instruction::Xor: {
4887 // Just widen unops and binops.
4888 setDebugLocFromInst(&I);
4890 for (unsigned Part = 0; Part < UF; ++Part) {
4891 SmallVector<Value *, 2> Ops;
4892 for (VPValue *VPOp : User.operands())
4893 Ops.push_back(State.get(VPOp, Part));
4895 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4897 if (auto *VecOp = dyn_cast<Instruction>(V))
4898 VecOp->copyIRFlags(&I);
4900 // Use this vector value for all users of the original instruction.
4901 State.set(Def, V, Part);
4902 addMetadata(V, &I);
4905 break;
4907 case Instruction::ICmp:
4908 case Instruction::FCmp: {
4909 // Widen compares. Generate vector compares.
4910 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4911 auto *Cmp = cast<CmpInst>(&I);
4912 setDebugLocFromInst(Cmp);
4913 for (unsigned Part = 0; Part < UF; ++Part) {
4914 Value *A = State.get(User.getOperand(0), Part);
4915 Value *B = State.get(User.getOperand(1), Part);
4916 Value *C = nullptr;
4917 if (FCmp) {
4918 // Propagate fast math flags.
4919 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4920 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4921 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4922 } else {
4923 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4925 State.set(Def, C, Part);
4926 addMetadata(C, &I);
4929 break;
4932 case Instruction::ZExt:
4933 case Instruction::SExt:
4934 case Instruction::FPToUI:
4935 case Instruction::FPToSI:
4936 case Instruction::FPExt:
4937 case Instruction::PtrToInt:
4938 case Instruction::IntToPtr:
4939 case Instruction::SIToFP:
4940 case Instruction::UIToFP:
4941 case Instruction::Trunc:
4942 case Instruction::FPTrunc:
4943 case Instruction::BitCast: {
4944 auto *CI = cast<CastInst>(&I);
4945 setDebugLocFromInst(CI);
4947 /// Vectorize casts.
4948 Type *DestTy =
4949 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4951 for (unsigned Part = 0; Part < UF; ++Part) {
4952 Value *A = State.get(User.getOperand(0), Part);
4953 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4954 State.set(Def, Cast, Part);
4955 addMetadata(Cast, &I);
4957 break;
4959 default:
4960 // This instruction is not vectorized by simple widening.
4961 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4962 llvm_unreachable("Unhandled instruction!");
4963 } // end of switch.
4966 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4967 VPUser &ArgOperands,
4968 VPTransformState &State) {
4969 assert(!isa<DbgInfoIntrinsic>(I) &&
4970 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4971 setDebugLocFromInst(&I);
4973 Module *M = I.getParent()->getParent()->getParent();
4974 auto *CI = cast<CallInst>(&I);
4976 SmallVector<Type *, 4> Tys;
4977 for (Value *ArgOperand : CI->arg_operands())
4978 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4980 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4982 // The flag shows whether we use an intrinsic or an ordinary call for the
4983 // vectorized version of the instruction, i.e. whether it is beneficial to
4984 // perform an intrinsic call compared to a library call.
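// E.g. (sketch): for a call that maps to llvm.sqrt we compare the widened
// intrinsic (@llvm.sqrt.v4f32 for VF = 4) against any vector library routine
// the VFDatabase knows about, and pick whichever the cost model found cheaper.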
4985 bool NeedToScalarize = false;
4986 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4987 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4988 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4989 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4990 "Instruction should be scalarized elsewhere.");
4991 assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4992 "Either the intrinsic cost or vector call cost must be valid");
4994 for (unsigned Part = 0; Part < UF; ++Part) {
4995 SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4996 SmallVector<Value *, 4> Args;
4997 for (auto &I : enumerate(ArgOperands.operands())) {
4998 // Some intrinsics have a scalar argument - don't replace it with a
4999 // vector.
5000 Value *Arg;
5001 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5002 Arg = State.get(I.value(), Part);
5003 else {
5004 Arg = State.get(I.value(), VPIteration(0, 0));
5005 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
5006 TysForDecl.push_back(Arg->getType());
5008 Args.push_back(Arg);
5011 Function *VectorF;
5012 if (UseVectorIntrinsic) {
5013 // Use vector version of the intrinsic.
5014 if (VF.isVector())
5015 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5016 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5017 assert(VectorF && "Can't retrieve vector intrinsic.");
5018 } else {
5019 // Use vector version of the function call.
5020 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5021 #ifndef NDEBUG
5022 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5023 "Can't create vector function.");
5024 #endif
5025 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5027 SmallVector<OperandBundleDef, 1> OpBundles;
5028 CI->getOperandBundlesAsDefs(OpBundles);
5029 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5031 if (isa<FPMathOperator>(V))
5032 V->copyFastMathFlags(CI);
5034 State.set(Def, V, Part);
5035 addMetadata(V, &I);
5039 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5040 VPUser &Operands,
5041 bool InvariantCond,
5042 VPTransformState &State) {
5043 setDebugLocFromInst(&I);
5045 // The condition can be loop invariant but still defined inside the
5046 // loop. This means that we can't just use the original 'cond' value.
5047 // We have to take the 'vectorized' value and pick the first lane.
5048 // InstCombine will make this a no-op.
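// Sketch: for a condition such as %c = icmp eq i64 %inv1, %inv2 that is
// loop-invariant but defined in the loop, we use lane 0 of its widened value
// as the scalar select condition instead of the original scalar %c.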
5049 auto *InvarCond = InvariantCond
5050 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5051 : nullptr;
5053 for (unsigned Part = 0; Part < UF; ++Part) {
5054 Value *Cond =
5055 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5056 Value *Op0 = State.get(Operands.getOperand(1), Part);
5057 Value *Op1 = State.get(Operands.getOperand(2), Part);
5058 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5059 State.set(VPDef, Sel, Part);
5060 addMetadata(Sel, &I);
5064 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5065 // We should not collect Scalars more than once per VF. Right now, this
5066 // function is called from collectUniformsAndScalars(), which already does
5067 // this check. Collecting Scalars for VF=1 does not make any sense.
5068 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5069 "This function should not be visited twice for the same VF");
5071 SmallSetVector<Instruction *, 8> Worklist;
5073 // These sets are used to seed the analysis with pointers used by memory
5074 // accesses that will remain scalar.
5075 SmallSetVector<Instruction *, 8> ScalarPtrs;
5076 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5077 auto *Latch = TheLoop->getLoopLatch();
5079 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5080 // The pointer operands of loads and stores will be scalar as long as the
5081 // memory access is not a gather or scatter operation. The value operand of a
5082 // store will remain scalar if the store is scalarized.
5083 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5084 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5085 assert(WideningDecision != CM_Unknown &&
5086 "Widening decision should be ready at this moment");
5087 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5088 if (Ptr == Store->getValueOperand())
5089 return WideningDecision == CM_Scalarize;
5090 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5091 "Ptr is neither a value or pointer operand");
5092 return WideningDecision != CM_GatherScatter;
5095 // A helper that returns true if the given value is a bitcast or
5096 // getelementptr instruction contained in the loop.
5097 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5098 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5099 isa<GetElementPtrInst>(V)) &&
5100 !TheLoop->isLoopInvariant(V);
5103 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5104 if (!isa<PHINode>(Ptr) ||
5105 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5106 return false;
5107 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5108 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5109 return false;
5110 return isScalarUse(MemAccess, Ptr);
5113 // A helper that evaluates a memory access's use of a pointer. If the
5114 // pointer is actually the pointer induction of a loop, it is inserted
5115 // into the Worklist. If the use will be a scalar use, and the
5116 // pointer is only used by memory accesses, we place the pointer in
5117 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5118 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5119 if (isScalarPtrInduction(MemAccess, Ptr)) {
5120 Worklist.insert(cast<Instruction>(Ptr));
5121 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5122 << "\n");
5124 Instruction *Update = cast<Instruction>(
5125 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5126 ScalarPtrs.insert(Update);
5127 return;
5129 // We only care about bitcast and getelementptr instructions contained in
5130 // the loop.
5131 if (!isLoopVaryingBitCastOrGEP(Ptr))
5132 return;
5134 // If the pointer has already been identified as scalar (e.g., if it was
5135 // also identified as uniform), there's nothing to do.
5136 auto *I = cast<Instruction>(Ptr);
5137 if (Worklist.count(I))
5138 return;
5140 // If the use of the pointer will be a scalar use, and all users of the
5141 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5142 // place the pointer in PossibleNonScalarPtrs.
5143 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5144 return isa<LoadInst>(U) || isa<StoreInst>(U);
5146 ScalarPtrs.insert(I);
5147 else
5148 PossibleNonScalarPtrs.insert(I);
5151 // We seed the scalars analysis with two classes of instructions: (1)
5152 // instructions marked uniform-after-vectorization and (2) bitcast,
5153 // getelementptr and (pointer) phi instructions used by memory accesses
5154 // requiring a scalar use.
5156 // (1) Add to the worklist all instructions that have been identified as
5157 // uniform-after-vectorization.
5158 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5160 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5161 // memory accesses requiring a scalar use. The pointer operands of loads and
5162 // stores will be scalar as long as the memory access is not a gather or
5163 // scatter operation. The value operand of a store will remain scalar if the
5164 // store is scalarized.
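// E.g. (sketch): the address of a consecutive (widened) or scalarized access
// only ever needs scalar values, whereas a gather/scatter needs a vector of
// pointers, so its address computation is not added here.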
5165 for (auto *BB : TheLoop->blocks())
5166 for (auto &I : *BB) {
5167 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5168 evaluatePtrUse(Load, Load->getPointerOperand());
5169 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5170 evaluatePtrUse(Store, Store->getPointerOperand());
5171 evaluatePtrUse(Store, Store->getValueOperand());
5174 for (auto *I : ScalarPtrs)
5175 if (!PossibleNonScalarPtrs.count(I)) {
5176 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5177 Worklist.insert(I);
5180 // Insert the forced scalars.
5181 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5182 // induction variable when the PHI user is scalarized.
5183 auto ForcedScalar = ForcedScalars.find(VF);
5184 if (ForcedScalar != ForcedScalars.end())
5185 for (auto *I : ForcedScalar->second)
5186 Worklist.insert(I);
5188 // Expand the worklist by looking through any bitcasts and getelementptr
5189 // instructions we've already identified as scalar. This is similar to the
5190 // expansion step in collectLoopUniforms(); however, here we're only
5191 // expanding to include additional bitcasts and getelementptr instructions.
5192 unsigned Idx = 0;
5193 while (Idx != Worklist.size()) {
5194 Instruction *Dst = Worklist[Idx++];
5195 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5196 continue;
5197 auto *Src = cast<Instruction>(Dst->getOperand(0));
5198 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5199 auto *J = cast<Instruction>(U);
5200 return !TheLoop->contains(J) || Worklist.count(J) ||
5201 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5202 isScalarUse(J, Src));
5203 })) {
5204 Worklist.insert(Src);
5205 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5209 // An induction variable will remain scalar if all users of the induction
5210 // variable and induction variable update remain scalar.
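// E.g. (sketch): an induction that is only used to form addresses that were
// already identified as scalar (and by its own update) can stay scalar, while
// one that also feeds a widened instruction in the loop cannot.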
5211 for (auto &Induction : Legal->getInductionVars()) {
5212 auto *Ind = Induction.first;
5213 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5215 // If tail-folding is applied, the primary induction variable will be used
5216 // to feed a vector compare.
5217 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5218 continue;
5220 // Determine if all users of the induction variable are scalar after
5221 // vectorization.
5222 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5223 auto *I = cast<Instruction>(U);
5224 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5226 if (!ScalarInd)
5227 continue;
5229 // Determine if all users of the induction variable update instruction are
5230 // scalar after vectorization.
5231 auto ScalarIndUpdate =
5232 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5233 auto *I = cast<Instruction>(U);
5234 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5236 if (!ScalarIndUpdate)
5237 continue;
5239 // The induction variable and its update instruction will remain scalar.
5240 Worklist.insert(Ind);
5241 Worklist.insert(IndUpdate);
5242 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5243 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5244 << "\n");
5247 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5250 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5251 if (!blockNeedsPredication(I->getParent()))
5252 return false;
5253 switch(I->getOpcode()) {
5254 default:
5255 break;
5256 case Instruction::Load:
5257 case Instruction::Store: {
5258 if (!Legal->isMaskRequired(I))
5259 return false;
5260 auto *Ptr = getLoadStorePointerOperand(I);
5261 auto *Ty = getLoadStoreType(I);
5262 const Align Alignment = getLoadStoreAlignment(I);
5263 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5264 TTI.isLegalMaskedGather(Ty, Alignment))
5265 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5266 TTI.isLegalMaskedScatter(Ty, Alignment));
5268 case Instruction::UDiv:
5269 case Instruction::SDiv:
5270 case Instruction::SRem:
5271 case Instruction::URem:
5272 return mayDivideByZero(*I);
5274 return false;
5277 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5278 Instruction *I, ElementCount VF) {
5279 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5280 assert(getWideningDecision(I, VF) == CM_Unknown &&
5281 "Decision should not be set yet.");
5282 auto *Group = getInterleavedAccessGroup(I);
5283 assert(Group && "Must have a group.");
5285 // If the instruction's allocated size doesn't equal its type size, it
5286 // requires padding and will be scalarized.
5287 auto &DL = I->getModule()->getDataLayout();
5288 auto *ScalarTy = getLoadStoreType(I);
5289 if (hasIrregularType(ScalarTy, DL))
5290 return false;
5292 // Check if masking is required.
5293 // A Group may need masking for one of two reasons: it resides in a block that
5294 // needs predication, or it was decided to use masking to deal with gaps
5295 // (either a gap at the end of a load-access that may result in a speculative
5296 // load, or any gaps in a store-access).
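// E.g. (sketch): a stride-2 load group that only uses a[2*i] but not a[2*i+1]
// has a gap at the end, so the widened load could read past the last element
// and needs either masking or a scalar epilogue.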
5297 bool PredicatedAccessRequiresMasking =
5298 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5299 bool LoadAccessWithGapsRequiresEpilogMasking =
5300 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
5301 !isScalarEpilogueAllowed();
5302 bool StoreAccessWithGapsRequiresMasking =
5303 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
5304 if (!PredicatedAccessRequiresMasking &&
5305 !LoadAccessWithGapsRequiresEpilogMasking &&
5306 !StoreAccessWithGapsRequiresMasking)
5307 return true;
5309 // If masked interleaving is required, we expect that the user/target had
5310 // enabled it, because otherwise it either wouldn't have been created or
5311 // it should have been invalidated by the CostModel.
5312 assert(useMaskedInterleavedAccesses(TTI) &&
5313 "Masked interleave-groups for predicated accesses are not enabled.");
5315 auto *Ty = getLoadStoreType(I);
5316 const Align Alignment = getLoadStoreAlignment(I);
5317 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5318 : TTI.isLegalMaskedStore(Ty, Alignment);
5321 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5322 Instruction *I, ElementCount VF) {
5323 // Get and ensure we have a valid memory instruction.
5324 LoadInst *LI = dyn_cast<LoadInst>(I);
5325 StoreInst *SI = dyn_cast<StoreInst>(I);
5326 assert((LI || SI) && "Invalid memory instruction");
5328 auto *Ptr = getLoadStorePointerOperand(I);
5330 // In order to be widened, the pointer should be consecutive, first of all.
5331 if (!Legal->isConsecutivePtr(Ptr))
5332 return false;
5334 // If the instruction is a store located in a predicated block, it will be
5335 // scalarized.
5336 if (isScalarWithPredication(I))
5337 return false;
5339 // If the instruction's allocated size doesn't equal its type size, it
5340 // requires padding and will be scalarized.
5341 auto &DL = I->getModule()->getDataLayout();
5342 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5343 if (hasIrregularType(ScalarTy, DL))
5344 return false;
5346 return true;
5349 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5350 // We should not collect Uniforms more than once per VF. Right now,
5351 // this function is called from collectUniformsAndScalars(), which
5352 // already does this check. Collecting Uniforms for VF=1 does not make any
5353 // sense.
5355 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5356 "This function should not be visited twice for the same VF");
5358 // Visit the list of Uniforms. If we do not find any uniform value, we will
5359 // not analyze it again. Uniforms.count(VF) will return 1.
5360 Uniforms[VF].clear();
5362 // We now know that the loop is vectorizable!
5363 // Collect instructions inside the loop that will remain uniform after
5364 // vectorization.
5366 // Global values, params and instructions outside of the current loop are out
5367 // of scope.
5368 auto isOutOfScope = [&](Value *V) -> bool {
5369 Instruction *I = dyn_cast<Instruction>(V);
5370 return (!I || !TheLoop->contains(I));
5373 SetVector<Instruction *> Worklist;
5374 BasicBlock *Latch = TheLoop->getLoopLatch();
5376 // Instructions that are scalar with predication must not be considered
5377 // uniform after vectorization, because that would create an erroneous
5378 // replicating region where only a single instance out of VF should be formed.
5379 // TODO: optimize such seldom cases if found important, see PR40816.
5380 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5381 if (isOutOfScope(I)) {
5382 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5383 << *I << "\n");
5384 return;
5386 if (isScalarWithPredication(I)) {
5387 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5388 << *I << "\n");
5389 return;
5391 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5392 Worklist.insert(I);
5395 // Start with the conditional branch. If the branch condition is an
5396 // instruction contained in the loop that is only used by the branch, it is
5397 // uniform.
5398 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5399 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5400 addToWorklistIfAllowed(Cmp);
5402 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5403 InstWidening WideningDecision = getWideningDecision(I, VF);
5404 assert(WideningDecision != CM_Unknown &&
5405 "Widening decision should be ready at this moment");
5407 // A uniform memory op is itself uniform. We exclude uniform stores
5408 // here as they demand the last lane, not the first one.
5409 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5410 assert(WideningDecision == CM_Scalarize);
5411 return true;
5414 return (WideningDecision == CM_Widen ||
5415 WideningDecision == CM_Widen_Reverse ||
5416 WideningDecision == CM_Interleave);
5420 // Returns true if Ptr is the pointer operand of a memory access instruction
5421 // I, and I is known to not require scalarization.
5422 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5423 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5426 // Holds a list of values which are known to have at least one uniform use.
5427 // Note that there may be other uses which aren't uniform. A "uniform use"
5428 // here is something which only demands lane 0 of the unrolled iterations;
5429 // it does not imply that all lanes produce the same value (e.g. this is not
5430 // the usual meaning of uniform).
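// Illustrative example with hypothetical IR (names are made up): for a
// consecutive load
//   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
//   %val = load i32, i32* %gep
// the widened load only needs the lane-0 value of %gep to form its vector
// pointer, so %gep has a "uniform use" here even though its per-lane values
// differ.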
5431 SetVector<Value *> HasUniformUse;
5433 // Scan the loop for instructions which are either a) known to have only
5434 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5435 for (auto *BB : TheLoop->blocks())
5436 for (auto &I : *BB) {
5437 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5438 switch (II->getIntrinsicID()) {
5439 case Intrinsic::sideeffect:
5440 case Intrinsic::experimental_noalias_scope_decl:
5441 case Intrinsic::assume:
5442 case Intrinsic::lifetime_start:
5443 case Intrinsic::lifetime_end:
5444 if (TheLoop->hasLoopInvariantOperands(&I))
5445 addToWorklistIfAllowed(&I);
5446 break;
5447 default:
5448 break;
5452 // ExtractValue instructions must be uniform, because the operands are
5453 // known to be loop-invariant.
5454 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5455 assert(isOutOfScope(EVI->getAggregateOperand()) &&
5456 "Expected aggregate value to be loop invariant");
5457 addToWorklistIfAllowed(EVI);
5458 continue;
5461 // If there's no pointer operand, there's nothing to do.
5462 auto *Ptr = getLoadStorePointerOperand(&I);
5463 if (!Ptr)
5464 continue;
5466 // A uniform memory op is itself uniform. We exclude uniform stores
5467 // here as they demand the last lane, not the first one.
5468 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5469 addToWorklistIfAllowed(&I);
5471 if (isUniformDecision(&I, VF)) {
5472 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5473 HasUniformUse.insert(Ptr);
5477 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5478 // demanding) users. Since loops are assumed to be in LCSSA form, this
5479 // disallows uses outside the loop as well.
5480 for (auto *V : HasUniformUse) {
5481 if (isOutOfScope(V))
5482 continue;
5483 auto *I = cast<Instruction>(V);
5484 auto UsersAreMemAccesses =
5485 llvm::all_of(I->users(), [&](User *U) -> bool {
5486 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5488 if (UsersAreMemAccesses)
5489 addToWorklistIfAllowed(I);
5492 // Expand Worklist in topological order: whenever a new instruction
5493 // is added, its users should already be inside the Worklist. This ensures
5494 // that a uniform instruction will only be used by uniform instructions.
5495 unsigned idx = 0;
5496 while (idx != Worklist.size()) {
5497 Instruction *I = Worklist[idx++];
5499 for (auto OV : I->operand_values()) {
5500 // isOutOfScope operands cannot be uniform instructions.
5501 if (isOutOfScope(OV))
5502 continue;
5503 // First-order recurrence PHIs should typically be considered
5504 // non-uniform.
5505 auto *OP = dyn_cast<PHINode>(OV);
5506 if (OP && Legal->isFirstOrderRecurrence(OP))
5507 continue;
5508 // If all the users of the operand are uniform, then add the
5509 // operand into the uniform worklist.
5510 auto *OI = cast<Instruction>(OV);
5511 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5512 auto *J = cast<Instruction>(U);
5513 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5515 addToWorklistIfAllowed(OI);
5519 // For an instruction to be added into Worklist above, all its users inside
5520 // the loop should also be in Worklist. However, this condition cannot be
5521 // true for phi nodes that form a cyclic dependence. We must process phi
5522 // nodes separately. An induction variable will remain uniform if all users
5523 // of the induction variable and induction variable update remain uniform.
5524 // The code below handles both pointer and non-pointer induction variables.
5525 for (auto &Induction : Legal->getInductionVars()) {
5526 auto *Ind = Induction.first;
5527 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5529 // Determine if all users of the induction variable are uniform after
5530 // vectorization.
5531 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5532 auto *I = cast<Instruction>(U);
5533 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5534 isVectorizedMemAccessUse(I, Ind);
5536 if (!UniformInd)
5537 continue;
5539 // Determine if all users of the induction variable update instruction are
5540 // uniform after vectorization.
5541 auto UniformIndUpdate =
5542 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5543 auto *I = cast<Instruction>(U);
5544 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5545 isVectorizedMemAccessUse(I, IndUpdate);
5547 if (!UniformIndUpdate)
5548 continue;
5550 // The induction variable and its update instruction will remain uniform.
5551 addToWorklistIfAllowed(Ind);
5552 addToWorklistIfAllowed(IndUpdate);
5555 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5558 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5559 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5561 if (Legal->getRuntimePointerChecking()->Need) {
5562 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5563 "runtime pointer checks needed. Enable vectorization of this "
5564 "loop with '#pragma clang loop vectorize(enable)' when "
5565 "compiling with -Os/-Oz",
5566 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5567 return true;
5570 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5571 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5572 "runtime SCEV checks needed. Enable vectorization of this "
5573 "loop with '#pragma clang loop vectorize(enable)' when "
5574 "compiling with -Os/-Oz",
5575 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5576 return true;
5579 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5580 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5581 reportVectorizationFailure("Runtime stride check for small trip count",
5582 "runtime stride == 1 checks needed. Enable vectorization of "
5583 "this loop without such check by compiling with -Os/-Oz",
5584 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5585 return true;
5588 return false;
5591 ElementCount
5592 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5593 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5594 return ElementCount::getScalable(0);
5596 if (Hints->isScalableVectorizationDisabled()) {
5597 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5598 "ScalableVectorizationDisabled", ORE, TheLoop);
5599 return ElementCount::getScalable(0);
5602 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5604 auto MaxScalableVF = ElementCount::getScalable(
5605 std::numeric_limits<ElementCount::ScalarTy>::max());
5607 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5608 // FIXME: While for scalable vectors this is currently sufficient, this should
5609 // be replaced by a more detailed mechanism that filters out specific VFs,
5610 // instead of invalidating vectorization for a whole set of VFs based on the
5611 // MaxVF.
5613 // Disable scalable vectorization if the loop contains unsupported reductions.
5614 if (!canVectorizeReductions(MaxScalableVF)) {
5615 reportVectorizationInfo(
5616 "Scalable vectorization not supported for the reduction "
5617 "operations found in this loop.",
5618 "ScalableVFUnfeasible", ORE, TheLoop);
5619 return ElementCount::getScalable(0);
5622 // Disable scalable vectorization if the loop contains any instructions
5623 // with element types not supported for scalable vectors.
5624 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5625 return !Ty->isVoidTy() &&
5626 !this->TTI.isElementTypeLegalForScalableVector(Ty);
5627 })) {
5628 reportVectorizationInfo("Scalable vectorization is not supported "
5629 "for all element types found in this loop.",
5630 "ScalableVFUnfeasible", ORE, TheLoop);
5631 return ElementCount::getScalable(0);
5634 if (Legal->isSafeForAnyVectorWidth())
5635 return MaxScalableVF;
5637 // Limit MaxScalableVF by the maximum safe dependence distance.
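// For example (hypothetical values): with MaxSafeElements == 32 and a target
// maximum vscale of 16, the clamped result below is
// ElementCount::getScalable(32 / 16), i.e. VF = vscale x 2.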
5638 Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5639 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5640 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
5641 .getVScaleRangeArgs()
5642 .second;
5643 if (VScaleMax > 0)
5644 MaxVScale = VScaleMax;
5646 MaxScalableVF = ElementCount::getScalable(
5647 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5648 if (!MaxScalableVF)
5649 reportVectorizationInfo(
5650 "Max legal vector width too small, scalable vectorization "
5651 "unfeasible.",
5652 "ScalableVFUnfeasible", ORE, TheLoop);
5654 return MaxScalableVF;
5657 FixedScalableVFPair
5658 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5659 ElementCount UserVF) {
5660 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5661 unsigned SmallestType, WidestType;
5662 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5664 // Get the maximum safe dependence distance in bits computed by LAA.
5665 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5666 // the memory access that is most restrictive (involved in the smallest
5667 // dependence distance).
5668 unsigned MaxSafeElements =
5669 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
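// For example (hypothetical numbers): a max safe vector width of 1024 bits
// with a widest type of 32 bits gives
// MaxSafeElements = PowerOf2Floor(1024 / 32) = 32.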
5671 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5672 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5674 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5675 << ".\n");
5676 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5677 << ".\n");
5679 // First analyze the UserVF, fall back if the UserVF should be ignored.
5680 if (UserVF) {
5681 auto MaxSafeUserVF =
5682 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5684 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5685 // If `VF=vscale x N` is safe, then so is `VF=N`
5686 if (UserVF.isScalable())
5687 return FixedScalableVFPair(
5688 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5689 else
5690 return UserVF;
5693 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5695 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5696 // is better to ignore the hint and let the compiler choose a suitable VF.
5697 if (!UserVF.isScalable()) {
5698 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5699 << " is unsafe, clamping to max safe VF="
5700 << MaxSafeFixedVF << ".\n");
5701 ORE->emit([&]() {
5702 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5703 TheLoop->getStartLoc(),
5704 TheLoop->getHeader())
5705 << "User-specified vectorization factor "
5706 << ore::NV("UserVectorizationFactor", UserVF)
5707 << " is unsafe, clamping to maximum safe vectorization factor "
5708 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5710 return MaxSafeFixedVF;
5713 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5714 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5715 << " is ignored because scalable vectors are not "
5716 "available.\n");
5717 ORE->emit([&]() {
5718 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5719 TheLoop->getStartLoc(),
5720 TheLoop->getHeader())
5721 << "User-specified vectorization factor "
5722 << ore::NV("UserVectorizationFactor", UserVF)
5723 << " is ignored because the target does not support scalable "
5724 "vectors. The compiler will pick a more suitable value.";
5726 } else {
5727 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5728 << " is unsafe. Ignoring scalable UserVF.\n");
5729 ORE->emit([&]() {
5730 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5731 TheLoop->getStartLoc(),
5732 TheLoop->getHeader())
5733 << "User-specified vectorization factor "
5734 << ore::NV("UserVectorizationFactor", UserVF)
5735 << " is unsafe. Ignoring the hint to let the compiler pick a "
5736 "more suitable value.";
5741 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5742 << " / " << WidestType << " bits.\n");
5744 FixedScalableVFPair Result(ElementCount::getFixed(1),
5745 ElementCount::getScalable(0));
5746 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5747 WidestType, MaxSafeFixedVF))
5748 Result.FixedVF = MaxVF;
5750 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5751 WidestType, MaxSafeScalableVF))
5752 if (MaxVF.isScalable()) {
5753 Result.ScalableVF = MaxVF;
5754 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5755 << "\n");
5758 return Result;
5761 FixedScalableVFPair
5762 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5763 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5764 // TODO: It may be useful to do this, since the check is still likely to be
5765 // dynamically uniform if the target can skip it.
5766 reportVectorizationFailure(
5767 "Not inserting runtime ptr check for divergent target",
5768 "runtime pointer checks needed. Not enabled for divergent target",
5769 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5770 return FixedScalableVFPair::getNone();
5773 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5774 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5775 if (TC == 1) {
5776 reportVectorizationFailure("Single iteration (non) loop",
5777 "loop trip count is one, irrelevant for vectorization",
5778 "SingleIterationLoop", ORE, TheLoop);
5779 return FixedScalableVFPair::getNone();
5782 switch (ScalarEpilogueStatus) {
5783 case CM_ScalarEpilogueAllowed:
5784 return computeFeasibleMaxVF(TC, UserVF);
5785 case CM_ScalarEpilogueNotAllowedUsePredicate:
5786 LLVM_FALLTHROUGH;
5787 case CM_ScalarEpilogueNotNeededUsePredicate:
5788 LLVM_DEBUG(
5789 dbgs() << "LV: vector predicate hint/switch found.\n"
5790 << "LV: Not allowing scalar epilogue, creating predicated "
5791 << "vector loop.\n");
5792 break;
5793 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5794 // fallthrough as a special case of OptForSize
5795 case CM_ScalarEpilogueNotAllowedOptSize:
5796 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5797 LLVM_DEBUG(
5798 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5799 else
5800 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5801 << "count.\n");
5803 // Bail if runtime checks are required, which are not good when optimising
5804 // for size.
5805 if (runtimeChecksRequired())
5806 return FixedScalableVFPair::getNone();
5808 break;
5811 // The only loops we can vectorize without a scalar epilogue, are loops with
5812 // a bottom-test and a single exiting block. We'd have to handle the fact
5813 // that not every instruction executes on the last iteration. This will
5814 // require a lane mask which varies through the vector loop body. (TODO)
5815 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5816 // If there was a tail-folding hint/switch, but we can't fold the tail by
5817 // masking, fallback to a vectorization with a scalar epilogue.
5818 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5819 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5820 "scalar epilogue instead.\n");
5821 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5822 return computeFeasibleMaxVF(TC, UserVF);
5824 return FixedScalableVFPair::getNone();
5827 // Now try the tail folding
5829 // Invalidate interleave groups that require an epilogue if we can't mask
5830 // the interleave-group.
5831 if (!useMaskedInterleavedAccesses(TTI)) {
5832 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5833 "No decisions should have been taken at this point");
5834 // Note: There is no need to invalidate any cost modeling decisions here, as
5835 // none were taken so far.
5836 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5839 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5840 // Avoid tail folding if the trip count is known to be a multiple of any VF
5841 // we chose.
5842 // FIXME: The condition below pessimises the case for fixed-width vectors,
5843 // when scalable VFs are also candidates for vectorization.
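// Worked example (hypothetical values): with a known trip count of 128,
// MaxFixedVF = 8 and UserIC = 2, MaxVFtimesIC = 16 and 128 urem 16 == 0,
// so no tail remains and tail folding is unnecessary.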
5844 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5845 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5846 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5847 "MaxFixedVF must be a power of 2");
5848 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5849 : MaxFixedVF.getFixedValue();
5850 ScalarEvolution *SE = PSE.getSE();
5851 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5852 const SCEV *ExitCount = SE->getAddExpr(
5853 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5854 const SCEV *Rem = SE->getURemExpr(
5855 SE->applyLoopGuards(ExitCount, TheLoop),
5856 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5857 if (Rem->isZero()) {
5858 // Accept MaxFixedVF if we do not have a tail.
5859 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5860 return MaxFactors;
5864 // For scalable vectors, don't use tail folding as this is currently not yet
5865 // supported. The code is likely to have ended up here if the tripcount is
5866 // low, in which case it makes sense not to use scalable vectors.
5867 if (MaxFactors.ScalableVF.isVector())
5868 MaxFactors.ScalableVF = ElementCount::getScalable(0);
5870 // If we don't know the precise trip count, or if the trip count that we
5871 // found modulo the vectorization factor is not zero, try to fold the tail
5872 // by masking.
5873 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5874 if (Legal->prepareToFoldTailByMasking()) {
5875 FoldTailByMasking = true;
5876 return MaxFactors;
5879 // If there was a tail-folding hint/switch, but we can't fold the tail by
5880 // masking, fallback to a vectorization with a scalar epilogue.
5881 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5882 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5883 "scalar epilogue instead.\n");
5884 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5885 return MaxFactors;
5888 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5889 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5890 return FixedScalableVFPair::getNone();
5893 if (TC == 0) {
5894 reportVectorizationFailure(
5895 "Unable to calculate the loop count due to complex control flow",
5896 "unable to calculate the loop count due to complex control flow",
5897 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5898 return FixedScalableVFPair::getNone();
5901 reportVectorizationFailure(
5902 "Cannot optimize for size and vectorize at the same time.",
5903 "cannot optimize for size and vectorize at the same time. "
5904 "Enable vectorization of this loop with '#pragma clang loop "
5905 "vectorize(enable)' when compiling with -Os/-Oz",
5906 "NoTailLoopWithOptForSize", ORE, TheLoop);
5907 return FixedScalableVFPair::getNone();
5910 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5911 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5912 const ElementCount &MaxSafeVF) {
5913 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5914 TypeSize WidestRegister = TTI.getRegisterBitWidth(
5915 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5916 : TargetTransformInfo::RGK_FixedWidthVector);
5918 // Convenience function to return the minimum of two ElementCounts.
5919 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5920 assert((LHS.isScalable() == RHS.isScalable()) &&
5921 "Scalable flags must match");
5922 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5925 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5926 // Note that both WidestRegister and WidestType may not be powers of 2.
5927 auto MaxVectorElementCount = ElementCount::get(
5928 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5929 ComputeScalableMaxVF);
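// For example (hypothetical numbers): a 256-bit widest register and a widest
// type of 32 bits give MaxVectorElementCount = PowerOf2Floor(256 / 32) = 8
// lanes before clamping against MaxSafeVF below.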
5930 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5931 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5932 << (MaxVectorElementCount * WidestType) << " bits.\n");
5934 if (!MaxVectorElementCount) {
5935 LLVM_DEBUG(dbgs() << "LV: The target has no "
5936 << (ComputeScalableMaxVF ? "scalable" : "fixed")
5937 << " vector registers.\n");
5938 return ElementCount::getFixed(1);
5941 const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5942 if (ConstTripCount &&
5943 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5944 isPowerOf2_32(ConstTripCount)) {
5945 // We need to clamp the VF to be the ConstTripCount. There is no point in
5946 // choosing a higher viable VF as done in the loop below. If
5947 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5948 // the TC is less than or equal to the known number of lanes.
5949 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5950 << ConstTripCount << "\n");
5951 return TripCountEC;
5954 ElementCount MaxVF = MaxVectorElementCount;
5955 if (TTI.shouldMaximizeVectorBandwidth() ||
5956 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5957 auto MaxVectorElementCountMaxBW = ElementCount::get(
5958 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5959 ComputeScalableMaxVF);
5960 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5962 // Collect all viable vectorization factors larger than the default MaxVF
5963 // (i.e. MaxVectorElementCount).
5964 SmallVector<ElementCount, 8> VFs;
5965 for (ElementCount VS = MaxVectorElementCount * 2;
5966 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5967 VFs.push_back(VS);
5969 // For each VF calculate its register usage.
5970 auto RUs = calculateRegisterUsage(VFs);
5972 // Select the largest VF which doesn't require more registers than existing
5973 // ones.
5974 for (int i = RUs.size() - 1; i >= 0; --i) {
5975 bool Selected = true;
5976 for (auto &pair : RUs[i].MaxLocalUsers) {
5977 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5978 if (pair.second > TargetNumRegisters)
5979 Selected = false;
5981 if (Selected) {
5982 MaxVF = VFs[i];
5983 break;
5986 if (ElementCount MinVF =
5987 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5988 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5989 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5990 << ") with target's minimum: " << MinVF << '\n');
5991 MaxVF = MinVF;
5995 return MaxVF;
5998 bool LoopVectorizationCostModel::isMoreProfitable(
5999 const VectorizationFactor &A, const VectorizationFactor &B) const {
6000 InstructionCost CostA = A.Cost;
6001 InstructionCost CostB = B.Cost;
6003 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
6005 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
6006 MaxTripCount) {
6007 // If we are folding the tail and the trip count is a known (possibly small)
6008 // constant, the trip count will be rounded up to an integer number of
6009 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
6010 // which we compare directly. When not folding the tail, the total cost will
6011 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
6012 // approximated with the per-lane cost below instead of using the tripcount
6013 // as here.
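// Illustrative comparison (hypothetical costs): with MaxTripCount = 10,
// A = {VF=2, Cost=10} and B = {VF=8, Cost=36}, RTCostA = 10 * ceil(10/2) = 50
// and RTCostB = 36 * ceil(10/8) = 72, so A is preferred even though B has the
// lower per-lane cost.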
6014 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
6015 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
6016 return RTCostA < RTCostB;
6019 // When set to preferred, for now assume vscale may be larger than 1, so
6020 // that scalable vectorization is slightly favorable over fixed-width
6021 // vectorization.
6022 if (Hints->isScalableVectorizationPreferred())
6023 if (A.Width.isScalable() && !B.Width.isScalable())
6024 return (CostA * B.Width.getKnownMinValue()) <=
6025 (CostB * A.Width.getKnownMinValue());
6027 // To avoid the need for FP division:
6028 // (CostA / A.Width) < (CostB / B.Width)
6029 // <=> (CostA * B.Width) < (CostB * A.Width)
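// For example (hypothetical costs): A = {VF=4, Cost=20} vs. B = {VF=8,
// Cost=36}: 20 * 8 = 160 is not less than 36 * 4 = 144, so A is not
// considered more profitable than B (per-lane cost 5 vs. 4.5).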
6030 return (CostA * B.Width.getKnownMinValue()) <
6031 (CostB * A.Width.getKnownMinValue());
6034 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
6035 const ElementCountSet &VFCandidates) {
6036 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
6037 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
6038 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
6039 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
6040 "Expected Scalar VF to be a candidate");
6042 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
6043 VectorizationFactor ChosenFactor = ScalarCost;
6045 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6046 if (ForceVectorization && VFCandidates.size() > 1) {
6047 // Ignore scalar width, because the user explicitly wants vectorization.
6048 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
6049 // evaluation.
6050 ChosenFactor.Cost = InstructionCost::getMax();
6053 SmallVector<InstructionVFPair> InvalidCosts;
6054 for (const auto &i : VFCandidates) {
6055 // The cost for scalar VF=1 is already calculated, so ignore it.
6056 if (i.isScalar())
6057 continue;
6059 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
6060 VectorizationFactor Candidate(i, C.first);
6061 LLVM_DEBUG(
6062 dbgs() << "LV: Vector loop of width " << i << " costs: "
6063 << (Candidate.Cost / Candidate.Width.getKnownMinValue())
6064 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
6065 << ".\n");
6067 if (!C.second && !ForceVectorization) {
6068 LLVM_DEBUG(
6069 dbgs() << "LV: Not considering vector loop of width " << i
6070 << " because it will not generate any vector instructions.\n");
6071 continue;
6074 // If profitable, add it to the ProfitableVFs list.
6075 if (isMoreProfitable(Candidate, ScalarCost))
6076 ProfitableVFs.push_back(Candidate);
6078 if (isMoreProfitable(Candidate, ChosenFactor))
6079 ChosenFactor = Candidate;
6082 // Emit a report of VFs with invalid costs in the loop.
6083 if (!InvalidCosts.empty()) {
6084 // Group the remarks per instruction, keeping the instruction order from
6085 // InvalidCosts.
6086 std::map<Instruction *, unsigned> Numbering;
6087 unsigned I = 0;
6088 for (auto &Pair : InvalidCosts)
6089 if (!Numbering.count(Pair.first))
6090 Numbering[Pair.first] = I++;
6092 // Sort the list, first on instruction(number) then on VF.
6093 llvm::sort(InvalidCosts,
6094 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
6095 if (Numbering[A.first] != Numbering[B.first])
6096 return Numbering[A.first] < Numbering[B.first];
6097 ElementCountComparator ECC;
6098 return ECC(A.second, B.second);
6101 // For a list of ordered instruction-vf pairs:
6102 // [(load, vf1), (load, vf2), (store, vf1)]
6103 // Group the instructions together to emit separate remarks for:
6104 // load (vf1, vf2)
6105 // store (vf1)
6106 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
6107 auto Subset = ArrayRef<InstructionVFPair>();
6108 do {
6109 if (Subset.empty())
6110 Subset = Tail.take_front(1);
6112 Instruction *I = Subset.front().first;
6114 // If the next instruction is different, or if there are no other pairs,
6115 // emit a remark for the collated subset. e.g.
6116 // [(load, vf1), (load, vf2)]
6117 // to emit:
6118 // remark: invalid costs for 'load' at VF=(vf1, vf2)
6119 if (Subset == Tail || Tail[Subset.size()].first != I) {
6120 std::string OutString;
6121 raw_string_ostream OS(OutString);
6122 assert(!Subset.empty() && "Unexpected empty range");
6123 OS << "Instruction with invalid costs prevented vectorization at VF=(";
6124 for (auto &Pair : Subset)
6125 OS << (Pair.second == Subset.front().second ? "" : ", ")
6126 << Pair.second;
6127 OS << "):";
6128 if (auto *CI = dyn_cast<CallInst>(I))
6129 OS << " call to " << CI->getCalledFunction()->getName();
6130 else
6131 OS << " " << I->getOpcodeName();
6132 OS.flush();
6133 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
6134 Tail = Tail.drop_front(Subset.size());
6135 Subset = {};
6136 } else
6137 // Grow the subset by one element
6138 Subset = Tail.take_front(Subset.size() + 1);
6139 } while (!Tail.empty());
6142 if (!EnableCondStoresVectorization && NumPredStores) {
6143 reportVectorizationFailure("There are conditional stores.",
6144 "store that is conditionally executed prevents vectorization",
6145 "ConditionalStore", ORE, TheLoop);
6146 ChosenFactor = ScalarCost;
6149 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
6150 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
6151 << "LV: Vectorization seems to be not beneficial, "
6152 << "but was forced by a user.\n");
6153 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
6154 return ChosenFactor;
6157 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
6158 const Loop &L, ElementCount VF) const {
6159 // Cross iteration phis such as reductions need special handling and are
6160 // currently unsupported.
6161 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
6162 return Legal->isFirstOrderRecurrence(&Phi) ||
6163 Legal->isReductionVariable(&Phi);
6165 return false;
6167 // Phis with uses outside of the loop require special handling and are
6168 // currently unsupported.
6169 for (auto &Entry : Legal->getInductionVars()) {
6170 // Look for uses of the value of the induction at the last iteration.
6171 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
6172 for (User *U : PostInc->users())
6173 if (!L.contains(cast<Instruction>(U)))
6174 return false;
6175 // Look for uses of penultimate value of the induction.
6176 for (User *U : Entry.first->users())
6177 if (!L.contains(cast<Instruction>(U)))
6178 return false;
6181 // Induction variables that are widened require special handling that is
6182 // currently not supported.
6183 if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6184 return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6185 this->isProfitableToScalarize(Entry.first, VF));
6187 return false;
6189 // Epilogue vectorization code has not been audited to ensure it handles
6190 // non-latch exits properly. It may be fine, but it needs to be audited and
6191 // tested.
6192 if (L.getExitingBlock() != L.getLoopLatch())
6193 return false;
6195 return true;
6198 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6199 const ElementCount VF) const {
6200 // FIXME: We need a much better cost-model to take different parameters such
6201 // as register pressure, code size increase and cost of extra branches into
6202 // account. For now we apply a very crude heuristic and only consider loops
6203 // with vectorization factors larger than a certain value.
6204 // We also consider epilogue vectorization unprofitable for targets that don't
6205 // consider interleaving beneficial (e.g. MVE).
6206 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6207 return false;
6208 if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6209 return true;
6210 return false;
6213 VectorizationFactor
6214 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6215 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6216 VectorizationFactor Result = VectorizationFactor::Disabled();
6217 if (!EnableEpilogueVectorization) {
6218 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6219 return Result;
6222 if (!isScalarEpilogueAllowed()) {
6223 LLVM_DEBUG(
6224 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6225 "allowed.\n";);
6226 return Result;
6229 // FIXME: This can be fixed for scalable vectors later, because at this stage
6230 // the LoopVectorizer will only consider vectorizing a loop with scalable
6231 // vectors when the loop has a hint to enable vectorization for a given VF.
6232 if (MainLoopVF.isScalable()) {
6233 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
6234 "yet supported.\n");
6235 return Result;
6238 // Not really a cost consideration, but check for unsupported cases here to
6239 // simplify the logic.
6240 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
6241 LLVM_DEBUG(
6242 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
6243 "not a supported candidate.\n";);
6244 return Result;
6247 if (EpilogueVectorizationForceVF > 1) {
6248 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
6249 if (LVP.hasPlanWithVFs(
6250 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
6251 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
6252 else {
6253 LLVM_DEBUG(
6254 dbgs()
6255 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6256 return Result;
6260 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6261 TheLoop->getHeader()->getParent()->hasMinSize()) {
6262 LLVM_DEBUG(
6263 dbgs()
6264 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6265 return Result;
6268 if (!isEpilogueVectorizationProfitable(MainLoopVF))
6269 return Result;
6271 for (auto &NextVF : ProfitableVFs)
6272 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6273 (Result.Width.getFixedValue() == 1 ||
6274 isMoreProfitable(NextVF, Result)) &&
6275 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6276 Result = NextVF;
6278 if (Result != VectorizationFactor::Disabled())
6279 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6280 << Result.Width.getFixedValue() << "\n";);
6281 return Result;
6284 std::pair<unsigned, unsigned>
6285 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6286 unsigned MinWidth = -1U;
6287 unsigned MaxWidth = 8;
6288 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6289 for (Type *T : ElementTypesInLoop) {
6290 MinWidth = std::min<unsigned>(
6291 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6292 MaxWidth = std::max<unsigned>(
6293 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6295 return {MinWidth, MaxWidth};
6298 void LoopVectorizationCostModel::collectElementTypesForWidening() {
6299 ElementTypesInLoop.clear();
6300 // For each block.
6301 for (BasicBlock *BB : TheLoop->blocks()) {
6302 // For each instruction in the loop.
6303 for (Instruction &I : BB->instructionsWithoutDebug()) {
6304 Type *T = I.getType();
6306 // Skip ignored values.
6307 if (ValuesToIgnore.count(&I))
6308 continue;
6310 // Only examine Loads, Stores and PHINodes.
6311 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6312 continue;
6314 // Examine PHI nodes that are reduction variables. Update the type to
6315 // account for the recurrence type.
6316 if (auto *PN = dyn_cast<PHINode>(&I)) {
6317 if (!Legal->isReductionVariable(PN))
6318 continue;
6319 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
6320 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6321 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6322 RdxDesc.getRecurrenceType(),
6323 TargetTransformInfo::ReductionFlags()))
6324 continue;
6325 T = RdxDesc.getRecurrenceType();
6328 // Examine the stored values.
6329 if (auto *ST = dyn_cast<StoreInst>(&I))
6330 T = ST->getValueOperand()->getType();
6332 // Ignore loaded pointer types and stored pointer types that are not
6333 // vectorizable.
6335 // FIXME: The check here attempts to predict whether a load or store will
6336 // be vectorized. We only know this for certain after a VF has
6337 // been selected. Here, we assume that if an access can be
6338 // vectorized, it will be. We should also look at extending this
6339 // optimization to non-pointer types.
6341 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6342 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6343 continue;
6345 ElementTypesInLoop.insert(T);
6350 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6351 unsigned LoopCost) {
6352 // -- The interleave heuristics --
6353 // We interleave the loop in order to expose ILP and reduce the loop overhead.
6354 // There are many micro-architectural considerations that we can't predict
6355 // at this level. For example, frontend pressure (on decode or fetch) due to
6356 // code size, or the number and capabilities of the execution ports.
6358 // We use the following heuristics to select the interleave count:
6359 // 1. If the code has reductions, then we interleave to break the cross
6360 // iteration dependency.
6361 // 2. If the loop is really small, then we interleave to reduce the loop
6362 // overhead.
6363 // 3. We don't interleave if we think that we will spill registers to memory
6364 // due to the increased register pressure.
6366 if (!isScalarEpilogueAllowed())
6367 return 1;
6369 // The max safe dependence distance already limited the VF; don't interleave further.
6370 if (Legal->getMaxSafeDepDistBytes() != -1U)
6371 return 1;
6373 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6374 const bool HasReductions = !Legal->getReductionVars().empty();
6375 // Do not interleave loops with a relatively small known or estimated trip
6376 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6377 // enabled, and the code has scalar reductions (HasReductions && VF == 1),
6378 // because with the above conditions interleaving can expose ILP and break
6379 // cross-iteration dependences for reductions.
6380 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6381 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6382 return 1;
6384 RegisterUsage R = calculateRegisterUsage({VF})[0];
6385 // We divide by these constants so assume that we have at least one
6386 // instruction that uses at least one register.
6387 for (auto& pair : R.MaxLocalUsers) {
6388 pair.second = std::max(pair.second, 1U);
6391 // We calculate the interleave count using the following formula.
6392 // Subtract the number of loop invariants from the number of available
6393 // registers. These registers are used by all of the interleaved instances.
6394 // Next, divide the remaining registers by the number of registers that is
6395 // required by the loop, in order to estimate how many parallel instances
6396 // fit without causing spills. All of this is rounded down if necessary to be
6397 // a power of two. We want power of two interleave count to simplify any
6398 // addressing operations or alignment considerations.
6399 // We also want power of two interleave counts to ensure that the induction
6400 // variable of the vector loop wraps to zero, when tail is folded by masking;
6401 // this currently happens when OptForSize, in which case IC is set to 1 above.
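// Worked example (hypothetical numbers): with 32 registers in a class, 2 of
// them occupied by loop-invariant values and at most 10 values live at once,
// the per-class estimate is PowerOf2Floor((32 - 2) / 10) = 2 interleaved
// copies.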
6402 unsigned IC = UINT_MAX;
6404 for (auto& pair : R.MaxLocalUsers) {
6405 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6406 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6407 << " registers of "
6408 << TTI.getRegisterClassName(pair.first) << " register class\n");
6409 if (VF.isScalar()) {
6410 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6411 TargetNumRegisters = ForceTargetNumScalarRegs;
6412 } else {
6413 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6414 TargetNumRegisters = ForceTargetNumVectorRegs;
6416 unsigned MaxLocalUsers = pair.second;
6417 unsigned LoopInvariantRegs = 0;
6418 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6419 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6421 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6422 // Don't count the induction variable as interleaved.
6423 if (EnableIndVarRegisterHeur) {
6424 TmpIC =
6425 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6426 std::max(1U, (MaxLocalUsers - 1)));
6429 IC = std::min(IC, TmpIC);
6432 // Clamp the interleave ranges to reasonable counts.
6433 unsigned MaxInterleaveCount =
6434 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6436 // Check if the user has overridden the max.
6437 if (VF.isScalar()) {
6438 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6439 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6440 } else {
6441 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6442 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6445 // If the trip count is a known or estimated compile-time constant, limit the
6446 // interleave count to be less than the trip count divided by VF, provided it
6447 // is at least 1.
6449 // For scalable vectors we can't know if interleaving is beneficial. It may
6450 // not be beneficial for small loops if none of the lanes in the second vector
6451 // iteration is enabled. However, for larger loops, there is likely to be a
6452 // similar benefit as for fixed-width vectors. For now, we choose to leave
6453 // the InterleaveCount as if vscale is '1', although if some information about
6454 // the vector is known (e.g. min vector size), we can make a better decision.
6455 if (BestKnownTC) {
6456 MaxInterleaveCount =
6457 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6458 // Make sure MaxInterleaveCount is greater than 0.
6459 MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6462 assert(MaxInterleaveCount > 0 &&
6463 "Maximum interleave count must be greater than 0");
6465 // Clamp the calculated IC to be between 1 and the max interleave count
6466 // that the target and trip count allows.
6467 if (IC > MaxInterleaveCount)
6468 IC = MaxInterleaveCount;
6469 else
6470 // Make sure IC is greater than 0.
6471 IC = std::max(1u, IC);
6473 assert(IC > 0 && "Interleave count must be greater than 0.");
6475 // If we did not calculate the cost for VF (because the user selected the VF)
6476 // then we calculate the cost of VF here.
6477 if (LoopCost == 0) {
6478 InstructionCost C = expectedCost(VF).first;
6479 assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6480 LoopCost = *C.getValue();
6483 assert(LoopCost && "Non-zero loop cost expected");
6485 // Interleave if we vectorized this loop and there is a reduction that could
6486 // benefit from interleaving.
6487 if (VF.isVector() && HasReductions) {
6488 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6489 return IC;
6492 // Note that if we've already vectorized the loop we will have done the
6493 // runtime check and so interleaving won't require further checks.
6494 bool InterleavingRequiresRuntimePointerCheck =
6495 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6497 // We want to interleave small loops in order to reduce the loop overhead and
6498 // potentially expose ILP opportunities.
6499 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6500 << "LV: IC is " << IC << '\n'
6501 << "LV: VF is " << VF << '\n');
6502 const bool AggressivelyInterleaveReductions =
6503 TTI.enableAggressiveInterleaving(HasReductions);
6504 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6505 // We assume that the cost overhead is 1 and we use the cost model
6506 // to estimate the cost of the loop and interleave until the cost of the
6507 // loop overhead is about 5% of the cost of the loop.
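// For example (hypothetical numbers): with SmallLoopCost = 20 and
// LoopCost = 6, SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).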
6508 unsigned SmallIC =
6509 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6511 // Interleave until store/load ports (estimated by max interleave count) are
6512 // saturated.
6513 unsigned NumStores = Legal->getNumStores();
6514 unsigned NumLoads = Legal->getNumLoads();
6515 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6516 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6518 // If we have a scalar reduction (vector reductions are already dealt with
6519 // by this point), we can increase the critical path length if the loop
6520 // we're interleaving is inside another loop. For tree-wise reductions
6521 // set the limit to 2, and for ordered reductions it's best to disable
6522 // interleaving entirely.
6523 if (HasReductions && TheLoop->getLoopDepth() > 1) {
6524 bool HasOrderedReductions =
6525 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6526 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6527 return RdxDesc.isOrdered();
6529 if (HasOrderedReductions) {
6530 LLVM_DEBUG(
6531 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6532 return 1;
6535 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6536 SmallIC = std::min(SmallIC, F);
6537 StoresIC = std::min(StoresIC, F);
6538 LoadsIC = std::min(LoadsIC, F);
6541 if (EnableLoadStoreRuntimeInterleave &&
6542 std::max(StoresIC, LoadsIC) > SmallIC) {
6543 LLVM_DEBUG(
6544 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6545 return std::max(StoresIC, LoadsIC);
6548 // If there are scalar reductions and TTI has enabled aggressive
6549 // interleaving for reductions, we will interleave to expose ILP.
6550 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6551 AggressivelyInterleaveReductions) {
6552 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6553 // Interleave no less than SmallIC but not as aggressive as the normal IC
6554 // to satisfy the rare situation when resources are too limited.
6555 return std::max(IC / 2, SmallIC);
6556 } else {
6557 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6558 return SmallIC;
6562 // Interleave if this is a large loop (small loops are already dealt with by
6563 // this point) that could benefit from interleaving.
6564 if (AggressivelyInterleaveReductions) {
6565 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6566 return IC;
6569 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6570 return 1;
6573 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6574 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6575 // This function calculates the register usage by measuring the highest number
6576 // of values that are alive at a single location. Obviously, this is a very
6577 // rough estimation. We scan the loop in topological order and
6578 // assign a number to each instruction. We use RPO to ensure that defs are
6579 // met before their users. We assume that each instruction that has in-loop
6580 // users starts an interval. We record every time that an in-loop value is
6581 // used, so we have a list of the first and last occurrences of each
6582 // instruction. Next, we transpose this data structure into a multi map that
6583 // holds the list of intervals that *end* at a specific location. This multi
6584 // map allows us to perform a linear search. We scan the instructions linearly
6585 // and record each time that a new interval starts, by placing it in a set.
6586 // If we find this value in the multi-map then we remove it from the set.
6587 // The max register usage is the maximum size of the set.
6588 // We also search for instructions that are defined outside the loop, but are
6589 // used inside the loop. We need this number separately from the max-interval
6590 // usage number because when we unroll, loop-invariant values do not take
6591 // more registers.
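// Rough illustration (hypothetical): a value defined at the top of the body
// and only used by the last instruction keeps its interval open across every
// instruction in between, while a value defined and consumed back-to-back
// overlaps with almost nothing; the maximum number of simultaneously open
// intervals is what gets reported per register class.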
6592 LoopBlocksDFS DFS(TheLoop);
6593 DFS.perform(LI);
6595 RegisterUsage RU;
6597 // Each 'key' in the map opens a new interval. The values
6598 // of the map are the index of the 'last seen' usage of the
6599 // instruction that is the key.
6600 using IntervalMap = DenseMap<Instruction *, unsigned>;
6602 // Maps instruction to its index.
6603 SmallVector<Instruction *, 64> IdxToInstr;
6604 // Marks the end of each interval.
6605 IntervalMap EndPoint;
6607 // Saves the set of instructions that are used inside the loop.
6607 SmallPtrSet<Instruction *, 8> Ends;
6608 // Saves the list of values that are used in the loop but are
6609 // defined outside the loop, such as arguments and constants.
6610 SmallPtrSet<Value *, 8> LoopInvariants;
6612 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6613 for (Instruction &I : BB->instructionsWithoutDebug()) {
6614 IdxToInstr.push_back(&I);
6616 // Save the end location of each USE.
6617 for (Value *U : I.operands()) {
6618 auto *Instr = dyn_cast<Instruction>(U);
6620 // Ignore non-instruction values such as arguments, constants, etc.
6621 if (!Instr)
6622 continue;
6624 // If this instruction is outside the loop then record it and continue.
6625 if (!TheLoop->contains(Instr)) {
6626 LoopInvariants.insert(Instr);
6627 continue;
6630 // Overwrite previous end points.
6631 EndPoint[Instr] = IdxToInstr.size();
6632 Ends.insert(Instr);
6637 // Saves the list of intervals that end with the index in 'key'.
6638 using InstrList = SmallVector<Instruction *, 2>;
6639 DenseMap<unsigned, InstrList> TransposeEnds;
6641 // Transpose the EndPoints to a list of values that end at each index.
6642 for (auto &Interval : EndPoint)
6643 TransposeEnds[Interval.second].push_back(Interval.first);
6645 SmallPtrSet<Instruction *, 8> OpenIntervals;
6646 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6647 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6649 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6651 // A lambda that gets the register usage for the given type and VF.
6652 const auto &TTICapture = TTI;
6653 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6654 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6655 return 0;
6656 InstructionCost::CostType RegUsage =
6657 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6658 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6659 "Nonsensical values for register usage.");
6660 return RegUsage;
6663 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6664 Instruction *I = IdxToInstr[i];
6666 // Remove all of the instructions that end at this location.
6667 InstrList &List = TransposeEnds[i];
6668 for (Instruction *ToRemove : List)
6669 OpenIntervals.erase(ToRemove);
6671 // Ignore instructions that are never used within the loop.
6672 if (!Ends.count(I))
6673 continue;
6675 // Skip ignored values.
6676 if (ValuesToIgnore.count(I))
6677 continue;
6679 // For each VF find the maximum usage of registers.
6680 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6681 // Count the number of live intervals.
6682 SmallMapVector<unsigned, unsigned, 4> RegUsage;
6684 if (VFs[j].isScalar()) {
6685 for (auto Inst : OpenIntervals) {
6686 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6687 if (RegUsage.find(ClassID) == RegUsage.end())
6688 RegUsage[ClassID] = 1;
6689 else
6690 RegUsage[ClassID] += 1;
6692 } else {
6693 collectUniformsAndScalars(VFs[j]);
6694 for (auto Inst : OpenIntervals) {
6695 // Skip ignored values for VF > 1.
6696 if (VecValuesToIgnore.count(Inst))
6697 continue;
6698 if (isScalarAfterVectorization(Inst, VFs[j])) {
6699 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6700 if (RegUsage.find(ClassID) == RegUsage.end())
6701 RegUsage[ClassID] = 1;
6702 else
6703 RegUsage[ClassID] += 1;
6704 } else {
6705 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6706 if (RegUsage.find(ClassID) == RegUsage.end())
6707 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6708 else
6709 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6714 for (auto& pair : RegUsage) {
6715 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6716 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6717 else
6718 MaxUsages[j][pair.first] = pair.second;
6722 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6723 << OpenIntervals.size() << '\n');
6725 // Add the current instruction to the list of open intervals.
6726 OpenIntervals.insert(I);
6729 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6730 SmallMapVector<unsigned, unsigned, 4> Invariant;
6732 for (auto Inst : LoopInvariants) {
6733 unsigned Usage =
6734 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6735 unsigned ClassID =
6736 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6737 if (Invariant.find(ClassID) == Invariant.end())
6738 Invariant[ClassID] = Usage;
6739 else
6740 Invariant[ClassID] += Usage;
6743 LLVM_DEBUG({
6744 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6745 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6746 << " item\n";
6747 for (const auto &pair : MaxUsages[i]) {
6748 dbgs() << "LV(REG): RegisterClass: "
6749 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6750 << " registers\n";
6752 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6753 << " item\n";
6754 for (const auto &pair : Invariant) {
6755 dbgs() << "LV(REG): RegisterClass: "
6756 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6757 << " registers\n";
6761 RU.LoopInvariantRegs = Invariant;
6762 RU.MaxLocalUsers = MaxUsages[i];
6763 RUs[i] = RU;
6766 return RUs;
6769 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6770 // TODO: Cost model for emulated masked load/store is completely
6771 // broken. This hack guides the cost model to use an artificially
6772 // high enough value to practically disable vectorization with such
6773 // operations, except where the previously deployed legality hack allowed
6774 // using very low cost values. This is to avoid regressions coming simply
6775 // from moving the "masked load/store" check from legality to the cost model.
6776 // Masked Load/Gather emulation was previously never allowed.
6777 // Only a limited amount of Masked Store/Scatter emulation was allowed.
6778 assert(isPredicatedInst(I) &&
6779 "Expecting a scalar emulated instruction");
6780 return isa<LoadInst>(I) ||
6781 (isa<StoreInst>(I) &&
6782 NumPredStores > NumberOfStoresToPredicate);
6785 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6786 // If we aren't vectorizing the loop, or if we've already collected the
6787 // instructions to scalarize, there's nothing to do. Collection may already
6788 // have occurred if we have a user-selected VF and are now computing the
6789 // expected cost for interleaving.
6790 if (VF.isScalar() || VF.isZero() ||
6791 InstsToScalarize.find(VF) != InstsToScalarize.end())
6792 return;
6794 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6795 // not profitable to scalarize any instructions, the presence of VF in the
6796 // map will indicate that we've analyzed it already.
6797 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6799 // Find all the instructions that are scalar with predication in the loop and
6800 // determine if it would be better to not if-convert the blocks they are in.
6801 // If so, we also record the instructions to scalarize.
6802 for (BasicBlock *BB : TheLoop->blocks()) {
6803 if (!blockNeedsPredication(BB))
6804 continue;
6805 for (Instruction &I : *BB)
6806 if (isScalarWithPredication(&I)) {
6807 ScalarCostsTy ScalarCosts;
6808         // Do not apply the discount if the VF is scalable, because that
6809         // would lead to invalid scalarization costs.
6810         // Also skip the discount logic if the hacked cost is needed
6811         // for emulated masked memrefs.
6812 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6813 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6814 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6815 // Remember that BB will remain after vectorization.
6816 PredicatedBBsAfterVectorization.insert(BB);
6821 int LoopVectorizationCostModel::computePredInstDiscount(
6822 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6823 assert(!isUniformAfterVectorization(PredInst, VF) &&
6824 "Instruction marked uniform-after-vectorization will be predicated");
6826 // Initialize the discount to zero, meaning that the scalar version and the
6827 // vector version cost the same.
6828 InstructionCost Discount = 0;
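  // For illustration: if the vector form of the chain would cost 10 and the
  // scalar form (after scaling by block probability) would cost 6, the
  // accumulated discount is 4 and the caller considers the chain profitable
  // to scalarize.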
6830 // Holds instructions to analyze. The instructions we visit are mapped in
6831 // ScalarCosts. Those instructions are the ones that would be scalarized if
6832 // we find that the scalar version costs less.
6833 SmallVector<Instruction *, 8> Worklist;
6835 // Returns true if the given instruction can be scalarized.
6836 auto canBeScalarized = [&](Instruction *I) -> bool {
6837 // We only attempt to scalarize instructions forming a single-use chain
6838 // from the original predicated block that would otherwise be vectorized.
6839 // Although not strictly necessary, we give up on instructions we know will
6840 // already be scalar to avoid traversing chains that are unlikely to be
6841 // beneficial.
6842 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6843 isScalarAfterVectorization(I, VF))
6844 return false;
6846 // If the instruction is scalar with predication, it will be analyzed
6847 // separately. We ignore it within the context of PredInst.
6848 if (isScalarWithPredication(I))
6849 return false;
6851 // If any of the instruction's operands are uniform after vectorization,
6852 // the instruction cannot be scalarized. This prevents, for example, a
6853 // masked load from being scalarized.
6855 // We assume we will only emit a value for lane zero of an instruction
6856 // marked uniform after vectorization, rather than VF identical values.
6857 // Thus, if we scalarize an instruction that uses a uniform, we would
6858 // create uses of values corresponding to the lanes we aren't emitting code
6859 // for. This behavior can be changed by allowing getScalarValue to clone
6860 // the lane zero values for uniforms rather than asserting.
6861 for (Use &U : I->operands())
6862 if (auto *J = dyn_cast<Instruction>(U.get()))
6863 if (isUniformAfterVectorization(J, VF))
6864 return false;
6866 // Otherwise, we can scalarize the instruction.
6867 return true;
6870 // Compute the expected cost discount from scalarizing the entire expression
6871 // feeding the predicated instruction. We currently only consider expressions
6872 // that are single-use instruction chains.
6873 Worklist.push_back(PredInst);
6874 while (!Worklist.empty()) {
6875 Instruction *I = Worklist.pop_back_val();
6877 // If we've already analyzed the instruction, there's nothing to do.
6878 if (ScalarCosts.find(I) != ScalarCosts.end())
6879 continue;
6881 // Compute the cost of the vector instruction. Note that this cost already
6882 // includes the scalarization overhead of the predicated instruction.
6883 InstructionCost VectorCost = getInstructionCost(I, VF).first;
6885 // Compute the cost of the scalarized instruction. This cost is the cost of
6886 // the instruction as if it wasn't if-converted and instead remained in the
6887 // predicated block. We will scale this cost by block probability after
6888 // computing the scalarization overhead.
6889 InstructionCost ScalarCost =
6890 VF.getFixedValue() *
6891 getInstructionCost(I, ElementCount::getFixed(1)).first;
6893 // Compute the scalarization overhead of needed insertelement instructions
6894 // and phi nodes.
6895 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6896 ScalarCost += TTI.getScalarizationOverhead(
6897 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6898 APInt::getAllOnesValue(VF.getFixedValue()), true, false);
6899 ScalarCost +=
6900 VF.getFixedValue() *
6901 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6904 // Compute the scalarization overhead of needed extractelement
6905 // instructions. For each of the instruction's operands, if the operand can
6906 // be scalarized, add it to the worklist; otherwise, account for the
6907 // overhead.
6908 for (Use &U : I->operands())
6909 if (auto *J = dyn_cast<Instruction>(U.get())) {
6910 assert(VectorType::isValidElementType(J->getType()) &&
6911 "Instruction has non-scalar type");
6912 if (canBeScalarized(J))
6913 Worklist.push_back(J);
6914 else if (needsExtract(J, VF)) {
6915 ScalarCost += TTI.getScalarizationOverhead(
6916 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6917 APInt::getAllOnesValue(VF.getFixedValue()), false, true);
6921 // Scale the total scalar cost by block probability.
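    // For illustration, assuming the default reciprocal predicated-block
    // probability of 2 (i.e. the block executes on roughly half of the
    // iterations), a raw scalar chain cost of 8 contributes 4 here.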
6922 ScalarCost /= getReciprocalPredBlockProb();
6924 // Compute the discount. A non-negative discount means the vector version
6925 // of the instruction costs more, and scalarizing would be beneficial.
6926 Discount += VectorCost - ScalarCost;
6927 ScalarCosts[I] = ScalarCost;
6930 return *Discount.getValue();
6933 LoopVectorizationCostModel::VectorizationCostTy
6934 LoopVectorizationCostModel::expectedCost(
6935 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6936 VectorizationCostTy Cost;
6938 // For each block.
6939 for (BasicBlock *BB : TheLoop->blocks()) {
6940 VectorizationCostTy BlockCost;
6942 // For each instruction in the old loop.
6943 for (Instruction &I : BB->instructionsWithoutDebug()) {
6944 // Skip ignored values.
6945 if (ValuesToIgnore.count(&I) ||
6946 (VF.isVector() && VecValuesToIgnore.count(&I)))
6947 continue;
6949 VectorizationCostTy C = getInstructionCost(&I, VF);
6951 // Check if we should override the cost.
6952 if (C.first.isValid() &&
6953 ForceTargetInstructionCost.getNumOccurrences() > 0)
6954 C.first = InstructionCost(ForceTargetInstructionCost);
6956 // Keep a list of instructions with invalid costs.
6957 if (Invalid && !C.first.isValid())
6958 Invalid->emplace_back(&I, VF);
6960 BlockCost.first += C.first;
6961 BlockCost.second |= C.second;
6962 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6963 << " for VF " << VF << " For instruction: " << I
6964 << '\n');
6967 // If we are vectorizing a predicated block, it will have been
6968 // if-converted. This means that the block's instructions (aside from
6969 // stores and instructions that may divide by zero) will now be
6970 // unconditionally executed. For the scalar case, we may not always execute
6971 // the predicated block, if it is an if-else block. Thus, scale the block's
6972 // cost by the probability of executing it. blockNeedsPredication from
6973 // Legal is used so as to not include all blocks in tail folded loops.
6974 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6975 BlockCost.first /= getReciprocalPredBlockProb();
6977 Cost.first += BlockCost.first;
6978 Cost.second |= BlockCost.second;
6981 return Cost;
6984 /// Gets the address access SCEV after verifying that the access pattern
6985 /// is loop invariant except for the induction variable dependence.
6987 /// This SCEV can be sent to the Target in order to estimate the address
6988 /// calculation cost.
6989 static const SCEV *getAddressAccessSCEV(
6990 Value *Ptr,
6991 LoopVectorizationLegality *Legal,
6992 PredicatedScalarEvolution &PSE,
6993 const Loop *TheLoop) {
6995 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6996 if (!Gep)
6997 return nullptr;
6999 // We are looking for a gep with all loop invariant indices except for one
7000 // which should be an induction variable.
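  // A hypothetical example of the accepted shape:
  //   %gep = getelementptr inbounds [256 x i32], [256 x i32]* %base, i64 0, i64 %iv
  // where %base is loop invariant and %iv is an induction variable.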
7001 auto SE = PSE.getSE();
7002 unsigned NumOperands = Gep->getNumOperands();
7003 for (unsigned i = 1; i < NumOperands; ++i) {
7004 Value *Opd = Gep->getOperand(i);
7005 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
7006 !Legal->isInductionVariable(Opd))
7007 return nullptr;
7010   // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the Ptr SCEV.
7011 return PSE.getSCEV(Ptr);
7014 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
7015 return Legal->hasStride(I->getOperand(0)) ||
7016 Legal->hasStride(I->getOperand(1));
7019 InstructionCost
7020 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
7021 ElementCount VF) {
7022 assert(VF.isVector() &&
7023 "Scalarization cost of instruction implies vectorization.");
7024 if (VF.isScalable())
7025 return InstructionCost::getInvalid();
7027 Type *ValTy = getLoadStoreType(I);
7028 auto SE = PSE.getSE();
7030 unsigned AS = getLoadStoreAddressSpace(I);
7031 Value *Ptr = getLoadStorePointerOperand(I);
7032 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
7034   // Figure out whether the access is strided and get the stride value
7035   // if it's known at compile time.
7036 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
7038 // Get the cost of the scalar memory instruction and address computation.
7039 InstructionCost Cost =
7040 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
7042 // Don't pass *I here, since it is scalar but will actually be part of a
7043 // vectorized loop where the user of it is a vectorized instruction.
7044 const Align Alignment = getLoadStoreAlignment(I);
7045 Cost += VF.getKnownMinValue() *
7046 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
7047 AS, TTI::TCK_RecipThroughput);
7049 // Get the overhead of the extractelement and insertelement instructions
7050 // we might create due to scalarization.
7051 Cost += getScalarizationOverhead(I, VF);
7053 // If we have a predicated load/store, it will need extra i1 extracts and
7054 // conditional branches, but may not be executed for each vector lane. Scale
7055 // the cost by the probability of executing the predicated block.
7056 if (isPredicatedInst(I)) {
7057 Cost /= getReciprocalPredBlockProb();
7059 // Add the cost of an i1 extract and a branch
7060 auto *Vec_i1Ty =
7061 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
7062 Cost += TTI.getScalarizationOverhead(
7063 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7064 /*Insert=*/false, /*Extract=*/true);
7065 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
7067 if (useEmulatedMaskMemRefHack(I))
7068       // Artificially set the cost to a value high enough to practically
7069       // disable vectorization of such operations.
7070 Cost = 3000000;
7073 return Cost;
7076 InstructionCost
7077 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7078 ElementCount VF) {
7079 Type *ValTy = getLoadStoreType(I);
7080 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7081 Value *Ptr = getLoadStorePointerOperand(I);
7082 unsigned AS = getLoadStoreAddressSpace(I);
7083 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
7084 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7086 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7087 "Stride should be 1 or -1 for consecutive memory access");
7088 const Align Alignment = getLoadStoreAlignment(I);
7089 InstructionCost Cost = 0;
7090 if (Legal->isMaskRequired(I))
7091 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7092 CostKind);
7093 else
7094 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7095 CostKind, I);
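  // A reverse (stride -1) access, e.g. a loop walking an array from its end
  // towards its start, additionally pays for one reverse shuffle per vector.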
7097 bool Reverse = ConsecutiveStride < 0;
7098 if (Reverse)
7099 Cost +=
7100 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7101 return Cost;
7104 InstructionCost
7105 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7106 ElementCount VF) {
7107 assert(Legal->isUniformMemOp(*I));
7109 Type *ValTy = getLoadStoreType(I);
7110 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7111 const Align Alignment = getLoadStoreAlignment(I);
7112 unsigned AS = getLoadStoreAddressSpace(I);
7113 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7114 if (isa<LoadInst>(I)) {
7115 return TTI.getAddressComputationCost(ValTy) +
7116 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
7117 CostKind) +
7118 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7120 StoreInst *SI = cast<StoreInst>(I);
7122 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
7123 return TTI.getAddressComputationCost(ValTy) +
7124 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
7125 CostKind) +
7126          (isLoopInvariantStoreValue
7127               ? 0
7128               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7129 VF.getKnownMinValue() - 1));
7132 InstructionCost
7133 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7134 ElementCount VF) {
7135 Type *ValTy = getLoadStoreType(I);
7136 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7137 const Align Alignment = getLoadStoreAlignment(I);
7138 const Value *Ptr = getLoadStorePointerOperand(I);
7140 return TTI.getAddressComputationCost(VectorTy) +
7141 TTI.getGatherScatterOpCost(
7142 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7143 TargetTransformInfo::TCK_RecipThroughput, I);
7146 InstructionCost
7147 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7148 ElementCount VF) {
7149 // TODO: Once we have support for interleaving with scalable vectors
7150 // we can calculate the cost properly here.
7151 if (VF.isScalable())
7152 return InstructionCost::getInvalid();
7154 Type *ValTy = getLoadStoreType(I);
7155 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7156 unsigned AS = getLoadStoreAddressSpace(I);
7158 auto Group = getInterleavedAccessGroup(I);
7159   assert(Group && "Failed to get an interleaved access group.");
7161 unsigned InterleaveFactor = Group->getFactor();
7162 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7164 // Holds the indices of existing members in the interleaved group.
7165 SmallVector<unsigned, 4> Indices;
7166 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
7167 if (Group->getMember(IF))
7168 Indices.push_back(IF);
7170 // Calculate the cost of the whole interleaved group.
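  // Masking for gaps is needed when the group requires a scalar epilogue but
  // one is not allowed, or when a store group has missing members.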
7171 bool UseMaskForGaps =
7172 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
7173 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
7174 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
7175 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
7176 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
7178 if (Group->isReverse()) {
7179 // TODO: Add support for reversed masked interleaved access.
7180 assert(!Legal->isMaskRequired(I) &&
7181 "Reverse masked interleaved access not supported.");
7182 Cost +=
7183 Group->getNumMembers() *
7184 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7186 return Cost;
7189 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
7190 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
7191 using namespace llvm::PatternMatch;
7192 // Early exit for no inloop reductions
7193 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
7194 return None;
7195 auto *VectorTy = cast<VectorType>(Ty);
7197   // We are looking for one of the following patterns, and for the minimal
7198   // acceptable cost of it:
7199   //  reduce(mul(ext(A), ext(B))) or
7200   //  reduce(mul(A, B)) or
7201   //  reduce(ext(A)) or
7202   //  reduce(A).
7203   // The basic idea is that we walk down the tree to do that, finding the root
7204   // reduction instruction in InLoopReductionImmediateChains. From there we find
7205   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
7206   // of its components. If the reduction cost is lower, we return it for the
7207   // reduction instruction and 0 for the other instructions in the pattern. If
7208   // not, we return an invalid cost and the original cost method should be used.
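  // For example (hypothetical IR names), an in-loop chain such as
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul
  // matches the reduce(mul(ext(A), ext(B))) form and may be costed as a single
  // extended multiply-add reduction when the target reports that as cheaper
  // than the sum of the individual instruction costs.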
7209 Instruction *RetI = I;
7210 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
7211 if (!RetI->hasOneUser())
7212 return None;
7213 RetI = RetI->user_back();
7215 if (match(RetI, m_Mul(m_Value(), m_Value())) &&
7216 RetI->user_back()->getOpcode() == Instruction::Add) {
7217 if (!RetI->hasOneUser())
7218 return None;
7219 RetI = RetI->user_back();
7222   // Test if the found instruction is a reduction; if not, return an invalid
7223   // cost so that the parent uses the original cost modelling.
7224 if (!InLoopReductionImmediateChains.count(RetI))
7225 return None;
7227 // Find the reduction this chain is a part of and calculate the basic cost of
7228 // the reduction on its own.
7229 Instruction *LastChain = InLoopReductionImmediateChains[RetI];
7230 Instruction *ReductionPhi = LastChain;
7231 while (!isa<PHINode>(ReductionPhi))
7232 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
7234 const RecurrenceDescriptor &RdxDesc =
7235 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
7237 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
7238 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
7240 // If we're using ordered reductions then we can just return the base cost
7241 // here, since getArithmeticReductionCost calculates the full ordered
7242 // reduction cost when FP reassociation is not allowed.
7243 if (useOrderedReductions(RdxDesc))
7244 return BaseCost;
7246 // Get the operand that was not the reduction chain and match it to one of the
7247 // patterns, returning the better cost if it is found.
7248 Instruction *RedOp = RetI->getOperand(1) == LastChain
7249 ? dyn_cast<Instruction>(RetI->getOperand(0))
7250 : dyn_cast<Instruction>(RetI->getOperand(1));
7252 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
7254 Instruction *Op0, *Op1;
7255 if (RedOp &&
7256 match(RedOp,
7257 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
7258 match(Op0, m_ZExtOrSExt(m_Value())) &&
7259 Op0->getOpcode() == Op1->getOpcode() &&
7260 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7261 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
7262 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
7264     // Matched reduce(ext(mul(ext(A), ext(B))))
7265     // Note that the extend opcodes all need to match; or, if A==B, they will have
7266 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
7267 // which is equally fine.
7268 bool IsUnsigned = isa<ZExtInst>(Op0);
7269 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7270 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
7272 InstructionCost ExtCost =
7273 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
7274 TTI::CastContextHint::None, CostKind, Op0);
7275 InstructionCost MulCost =
7276 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
7277 InstructionCost Ext2Cost =
7278 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
7279 TTI::CastContextHint::None, CostKind, RedOp);
7281 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7282 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7283 CostKind);
7285 if (RedCost.isValid() &&
7286 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
7287 return I == RetI ? RedCost : 0;
7288 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
7289 !TheLoop->isLoopInvariant(RedOp)) {
7290 // Matched reduce(ext(A))
7291 bool IsUnsigned = isa<ZExtInst>(RedOp);
7292 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7293 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7294 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7295 CostKind);
7297 InstructionCost ExtCost =
7298 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7299 TTI::CastContextHint::None, CostKind, RedOp);
7300 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7301 return I == RetI ? RedCost : 0;
7302 } else if (RedOp &&
7303 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
7304 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
7305 Op0->getOpcode() == Op1->getOpcode() &&
7306 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7307 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7308 bool IsUnsigned = isa<ZExtInst>(Op0);
7309 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7310 // Matched reduce(mul(ext, ext))
7311 InstructionCost ExtCost =
7312 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
7313 TTI::CastContextHint::None, CostKind, Op0);
7314 InstructionCost MulCost =
7315 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7317 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7318 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7319 CostKind);
7321 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
7322 return I == RetI ? RedCost : 0;
7323 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7324 // Matched reduce(mul())
7325 InstructionCost MulCost =
7326 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7328 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7329 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7330 CostKind);
7332 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7333 return I == RetI ? RedCost : 0;
7337 return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7340 InstructionCost
7341 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7342 ElementCount VF) {
7343 // Calculate scalar cost only. Vectorization cost should be ready at this
7344 // moment.
7345 if (VF.isScalar()) {
7346 Type *ValTy = getLoadStoreType(I);
7347 const Align Alignment = getLoadStoreAlignment(I);
7348 unsigned AS = getLoadStoreAddressSpace(I);
7350 return TTI.getAddressComputationCost(ValTy) +
7351 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7352 TTI::TCK_RecipThroughput, I);
7354 return getWideningCost(I, VF);
7357 LoopVectorizationCostModel::VectorizationCostTy
7358 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7359 ElementCount VF) {
7360 // If we know that this instruction will remain uniform, check the cost of
7361 // the scalar version.
7362 if (isUniformAfterVectorization(I, VF))
7363 VF = ElementCount::getFixed(1);
7365 if (VF.isVector() && isProfitableToScalarize(I, VF))
7366 return VectorizationCostTy(InstsToScalarize[VF][I], false);
7368 // Forced scalars do not have any scalarization overhead.
7369 auto ForcedScalar = ForcedScalars.find(VF);
7370 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7371 auto InstSet = ForcedScalar->second;
7372 if (InstSet.count(I))
7373 return VectorizationCostTy(
7374 (getInstructionCost(I, ElementCount::getFixed(1)).first *
7375 VF.getKnownMinValue()),
7376 false);
7379 Type *VectorTy;
7380 InstructionCost C = getInstructionCost(I, VF, VectorTy);
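  // The type counts as "not scalarized" when the target legalizes the vector
  // type into fewer parts than there are lanes, i.e. the instruction really
  // executes on vectors rather than being split per element.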
7382 bool TypeNotScalarized =
7383 VF.isVector() && VectorTy->isVectorTy() &&
7384 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7385 return VectorizationCostTy(C, TypeNotScalarized);
7388 InstructionCost
7389 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7390 ElementCount VF) const {
7392 // There is no mechanism yet to create a scalable scalarization loop,
7393 // so this is currently Invalid.
7394 if (VF.isScalable())
7395 return InstructionCost::getInvalid();
7397 if (VF.isScalar())
7398 return 0;
7400 InstructionCost Cost = 0;
7401 Type *RetTy = ToVectorTy(I->getType(), VF);
7402 if (!RetTy->isVoidTy() &&
7403 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7404 Cost += TTI.getScalarizationOverhead(
7405 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
7406 true, false);
7408 // Some targets keep addresses scalar.
7409 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7410 return Cost;
7412 // Some targets support efficient element stores.
7413 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7414 return Cost;
7416 // Collect operands to consider.
7417 CallInst *CI = dyn_cast<CallInst>(I);
7418 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
7420 // Skip operands that do not require extraction/scalarization and do not incur
7421 // any overhead.
7422 SmallVector<Type *> Tys;
7423 for (auto *V : filterExtractingOperands(Ops, VF))
7424 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7425 return Cost + TTI.getOperandsScalarizationOverhead(
7426 filterExtractingOperands(Ops, VF), Tys);
7429 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7430 if (VF.isScalar())
7431 return;
7432 NumPredStores = 0;
7433 for (BasicBlock *BB : TheLoop->blocks()) {
7434 // For each instruction in the old loop.
7435 for (Instruction &I : *BB) {
7436 Value *Ptr = getLoadStorePointerOperand(&I);
7437 if (!Ptr)
7438 continue;
7440 // TODO: We should generate better code and update the cost model for
7441 // predicated uniform stores. Today they are treated as any other
7442 // predicated store (see added test cases in
7443 // invariant-store-vectorization.ll).
7444 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7445 NumPredStores++;
7447 if (Legal->isUniformMemOp(I)) {
7448 // TODO: Avoid replicating loads and stores instead of
7449 // relying on instcombine to remove them.
7450 // Load: Scalar load + broadcast
7451 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
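        // That is, a uniform load becomes one scalar load plus a broadcast,
        // and a uniform store only pays for an extract of the last lane when
        // the stored value actually varies per lane.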
7452 InstructionCost Cost;
7453 if (isa<StoreInst>(&I) && VF.isScalable() &&
7454 isLegalGatherOrScatter(&I)) {
7455 Cost = getGatherScatterCost(&I, VF);
7456 setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7457 } else {
7458 assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7459 "Cannot yet scalarize uniform stores");
7460 Cost = getUniformMemOpCost(&I, VF);
7461 setWideningDecision(&I, VF, CM_Scalarize, Cost);
7463 continue;
7466 // We assume that widening is the best solution when possible.
7467 if (memoryInstructionCanBeWidened(&I, VF)) {
7468 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7469 int ConsecutiveStride =
7470 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7471 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7472 "Expected consecutive stride.");
7473 InstWidening Decision =
7474 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7475 setWideningDecision(&I, VF, Decision, Cost);
7476 continue;
7479 // Choose between Interleaving, Gather/Scatter or Scalarization.
7480 InstructionCost InterleaveCost = InstructionCost::getInvalid();
7481 unsigned NumAccesses = 1;
7482 if (isAccessInterleaved(&I)) {
7483 auto Group = getInterleavedAccessGroup(&I);
7484       assert(Group && "Failed to get an interleaved access group.");
7486 // Make one decision for the whole group.
7487 if (getWideningDecision(&I, VF) != CM_Unknown)
7488 continue;
7490 NumAccesses = Group->getNumMembers();
7491 if (interleavedAccessCanBeWidened(&I, VF))
7492 InterleaveCost = getInterleaveGroupCost(&I, VF);
7495 InstructionCost GatherScatterCost =
7496 isLegalGatherOrScatter(&I)
7497 ? getGatherScatterCost(&I, VF) * NumAccesses
7498 : InstructionCost::getInvalid();
7500 InstructionCost ScalarizationCost =
7501 getMemInstScalarizationCost(&I, VF) * NumAccesses;
7503       // Choose the best solution for the current VF, record this decision,
7504       // and use it during vectorization.
7505 InstructionCost Cost;
7506 InstWidening Decision;
7507 if (InterleaveCost <= GatherScatterCost &&
7508 InterleaveCost < ScalarizationCost) {
7509 Decision = CM_Interleave;
7510 Cost = InterleaveCost;
7511 } else if (GatherScatterCost < ScalarizationCost) {
7512 Decision = CM_GatherScatter;
7513 Cost = GatherScatterCost;
7514 } else {
7515 Decision = CM_Scalarize;
7516 Cost = ScalarizationCost;
7518       // If the instruction belongs to an interleave group, the whole group
7519       // receives the same decision. The cost is recorded for the group, but
7520       // it will actually be assigned to a single member instruction.
7521 if (auto Group = getInterleavedAccessGroup(&I))
7522 setWideningDecision(Group, VF, Decision, Cost);
7523 else
7524 setWideningDecision(&I, VF, Decision, Cost);
7528   // Make sure that any load of an address and any other address computation
7529 // remains scalar unless there is gather/scatter support. This avoids
7530 // inevitable extracts into address registers, and also has the benefit of
7531 // activating LSR more, since that pass can't optimize vectorized
7532 // addresses.
7533 if (TTI.prefersVectorizedAddressing())
7534 return;
7536 // Start with all scalar pointer uses.
7537 SmallPtrSet<Instruction *, 8> AddrDefs;
7538 for (BasicBlock *BB : TheLoop->blocks())
7539 for (Instruction &I : *BB) {
7540 Instruction *PtrDef =
7541 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7542 if (PtrDef && TheLoop->contains(PtrDef) &&
7543 getWideningDecision(&I, VF) != CM_GatherScatter)
7544 AddrDefs.insert(PtrDef);
7547 // Add all instructions used to generate the addresses.
7548 SmallVector<Instruction *, 4> Worklist;
7549 append_range(Worklist, AddrDefs);
7550 while (!Worklist.empty()) {
7551 Instruction *I = Worklist.pop_back_val();
7552 for (auto &Op : I->operands())
7553 if (auto *InstOp = dyn_cast<Instruction>(Op))
7554 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7555 AddrDefs.insert(InstOp).second)
7556 Worklist.push_back(InstOp);
7559 for (auto *I : AddrDefs) {
7560 if (isa<LoadInst>(I)) {
7561       // Setting the desired widening decision should ideally be handled by
7562       // the cost functions, but since this involves finding out whether the
7563       // loaded register is involved in an address computation, it is instead
7564       // changed here when we know this is the case.
7565 InstWidening Decision = getWideningDecision(I, VF);
7566 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7567 // Scalarize a widened load of address.
7568 setWideningDecision(
7569 I, VF, CM_Scalarize,
7570 (VF.getKnownMinValue() *
7571 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7572 else if (auto Group = getInterleavedAccessGroup(I)) {
7573 // Scalarize an interleave group of address loads.
7574 for (unsigned I = 0; I < Group->getFactor(); ++I) {
7575 if (Instruction *Member = Group->getMember(I))
7576 setWideningDecision(
7577 Member, VF, CM_Scalarize,
7578 (VF.getKnownMinValue() *
7579 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7582 } else
7583         // Make sure I gets scalarized and receives a cost estimate without
7584         // scalarization overhead.
7585 ForcedScalars[VF].insert(I);
7589 InstructionCost
7590 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7591 Type *&VectorTy) {
7592 Type *RetTy = I->getType();
7593 if (canTruncateToMinimalBitwidth(I, VF))
7594 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7595 auto SE = PSE.getSE();
7596 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7598 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7599 ElementCount VF) -> bool {
7600 if (VF.isScalar())
7601 return true;
7603 auto Scalarized = InstsToScalarize.find(VF);
7604 assert(Scalarized != InstsToScalarize.end() &&
7605 "VF not yet analyzed for scalarization profitability");
7606 return !Scalarized->second.count(I) &&
7607 llvm::all_of(I->users(), [&](User *U) {
7608 auto *UI = cast<Instruction>(U);
7609 return !Scalarized->second.count(UI);
7612 (void) hasSingleCopyAfterVectorization;
7614 if (isScalarAfterVectorization(I, VF)) {
7615 // With the exception of GEPs and PHIs, after scalarization there should
7616 // only be one copy of the instruction generated in the loop. This is
7617 // because the VF is either 1, or any instructions that need scalarizing
7618     // have already been dealt with by the time we get here. As a result,
7619 // it means we don't have to multiply the instruction cost by VF.
7620 assert(I->getOpcode() == Instruction::GetElementPtr ||
7621 I->getOpcode() == Instruction::PHI ||
7622 (I->getOpcode() == Instruction::BitCast &&
7623 I->getType()->isPointerTy()) ||
7624 hasSingleCopyAfterVectorization(I, VF));
7625 VectorTy = RetTy;
7626 } else
7627 VectorTy = ToVectorTy(RetTy, VF);
7629 // TODO: We need to estimate the cost of intrinsic calls.
7630 switch (I->getOpcode()) {
7631 case Instruction::GetElementPtr:
7632 // We mark this instruction as zero-cost because the cost of GEPs in
7633 // vectorized code depends on whether the corresponding memory instruction
7634 // is scalarized or not. Therefore, we handle GEPs with the memory
7635 // instruction cost.
7636 return 0;
7637 case Instruction::Br: {
7638 // In cases of scalarized and predicated instructions, there will be VF
7639 // predicated blocks in the vectorized loop. Each branch around these
7640     // blocks also requires an extract of its vector compare i1 element.
7641 bool ScalarPredicatedBB = false;
7642 BranchInst *BI = cast<BranchInst>(I);
7643 if (VF.isVector() && BI->isConditional() &&
7644 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7645 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7646 ScalarPredicatedBB = true;
7648 if (ScalarPredicatedBB) {
7649       // Not possible to scalarize a scalable vector with predicated instructions.
7650 if (VF.isScalable())
7651 return InstructionCost::getInvalid();
7652 // Return cost for branches around scalarized and predicated blocks.
7653 auto *Vec_i1Ty =
7654 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7655 return (
7656 TTI.getScalarizationOverhead(
7657 Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false,
7658 true) +
7659 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7660 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7661 // The back-edge branch will remain, as will all scalar branches.
7662 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7663 else
7664 // This branch will be eliminated by if-conversion.
7665 return 0;
7666 // Note: We currently assume zero cost for an unconditional branch inside
7667 // a predicated block since it will become a fall-through, although we
7668 // may decide in the future to call TTI for all branches.
7670 case Instruction::PHI: {
7671 auto *Phi = cast<PHINode>(I);
7673 // First-order recurrences are replaced by vector shuffles inside the loop.
7674 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7675 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7676 return TTI.getShuffleCost(
7677 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7678 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7680 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7681 // converted into select instructions. We require N - 1 selects per phi
7682 // node, where N is the number of incoming values.
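    // For example, a phi merging values from three predecessors is lowered to
    // two chained vector selects.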
7683 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7684 return (Phi->getNumIncomingValues() - 1) *
7685 TTI.getCmpSelInstrCost(
7686 Instruction::Select, ToVectorTy(Phi->getType(), VF),
7687 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7688 CmpInst::BAD_ICMP_PREDICATE, CostKind);
7690 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7692 case Instruction::UDiv:
7693 case Instruction::SDiv:
7694 case Instruction::URem:
7695 case Instruction::SRem:
7696 // If we have a predicated instruction, it may not be executed for each
7697 // vector lane. Get the scalarization cost and scale this amount by the
7698 // probability of executing the predicated block. If the instruction is not
7699 // predicated, we fall through to the next case.
7700 if (VF.isVector() && isScalarWithPredication(I)) {
7701 InstructionCost Cost = 0;
7703 // These instructions have a non-void type, so account for the phi nodes
7704 // that we will create. This cost is likely to be zero. The phi node
7705 // cost, if any, should be scaled by the block probability because it
7706 // models a copy at the end of each predicated block.
7707 Cost += VF.getKnownMinValue() *
7708 TTI.getCFInstrCost(Instruction::PHI, CostKind);
7710 // The cost of the non-predicated instruction.
7711 Cost += VF.getKnownMinValue() *
7712 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7714 // The cost of insertelement and extractelement instructions needed for
7715 // scalarization.
7716 Cost += getScalarizationOverhead(I, VF);
7718 // Scale the cost by the probability of executing the predicated blocks.
7719 // This assumes the predicated block for each vector lane is equally
7720 // likely.
7721 return Cost / getReciprocalPredBlockProb();
7723 LLVM_FALLTHROUGH;
7724 case Instruction::Add:
7725 case Instruction::FAdd:
7726 case Instruction::Sub:
7727 case Instruction::FSub:
7728 case Instruction::Mul:
7729 case Instruction::FMul:
7730 case Instruction::FDiv:
7731 case Instruction::FRem:
7732 case Instruction::Shl:
7733 case Instruction::LShr:
7734 case Instruction::AShr:
7735 case Instruction::And:
7736 case Instruction::Or:
7737 case Instruction::Xor: {
7738 // Since we will replace the stride by 1 the multiplication should go away.
7739 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7740 return 0;
7742 // Detect reduction patterns
7743 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7744 return *RedCost;
7746 // Certain instructions can be cheaper to vectorize if they have a constant
7747     // second vector operand. One example of this is shifts on x86.
7748 Value *Op2 = I->getOperand(1);
7749 TargetTransformInfo::OperandValueProperties Op2VP;
7750 TargetTransformInfo::OperandValueKind Op2VK =
7751 TTI.getOperandInfo(Op2, Op2VP);
7752 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7753 Op2VK = TargetTransformInfo::OK_UniformValue;
7755 SmallVector<const Value *, 4> Operands(I->operand_values());
7756 return TTI.getArithmeticInstrCost(
7757 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7758 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7760 case Instruction::FNeg: {
7761 return TTI.getArithmeticInstrCost(
7762 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7763 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7764 TargetTransformInfo::OP_None, I->getOperand(0), I);
7766 case Instruction::Select: {
7767 SelectInst *SI = cast<SelectInst>(I);
7768 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7769 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7771 const Value *Op0, *Op1;
7772 using namespace llvm::PatternMatch;
7773 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7774 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7775 // select x, y, false --> x & y
7776 // select x, true, y --> x | y
7777 TTI::OperandValueProperties Op1VP = TTI::OP_None;
7778 TTI::OperandValueProperties Op2VP = TTI::OP_None;
7779 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7780 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7781 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7782 Op1->getType()->getScalarSizeInBits() == 1);
7784 SmallVector<const Value *, 2> Operands{Op0, Op1};
7785 return TTI.getArithmeticInstrCost(
7786 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7787 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7790 Type *CondTy = SI->getCondition()->getType();
7791 if (!ScalarCond)
7792 CondTy = VectorType::get(CondTy, VF);
7793 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7794 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7796 case Instruction::ICmp:
7797 case Instruction::FCmp: {
7798 Type *ValTy = I->getOperand(0)->getType();
7799 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7800 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7801 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7802 VectorTy = ToVectorTy(ValTy, VF);
7803 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7804 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7806 case Instruction::Store:
7807 case Instruction::Load: {
7808 ElementCount Width = VF;
7809 if (Width.isVector()) {
7810 InstWidening Decision = getWideningDecision(I, Width);
7811 assert(Decision != CM_Unknown &&
7812 "CM decision should be taken at this point");
7813 if (Decision == CM_Scalarize)
7814 Width = ElementCount::getFixed(1);
7816 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7817 return getMemoryInstructionCost(I, VF);
7819 case Instruction::BitCast:
7820 if (I->getType()->isPointerTy())
7821 return 0;
7822 LLVM_FALLTHROUGH;
7823 case Instruction::ZExt:
7824 case Instruction::SExt:
7825 case Instruction::FPToUI:
7826 case Instruction::FPToSI:
7827 case Instruction::FPExt:
7828 case Instruction::PtrToInt:
7829 case Instruction::IntToPtr:
7830 case Instruction::SIToFP:
7831 case Instruction::UIToFP:
7832 case Instruction::Trunc:
7833 case Instruction::FPTrunc: {
7834 // Computes the CastContextHint from a Load/Store instruction.
7835 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7836 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7837 "Expected a load or a store!");
7839 if (VF.isScalar() || !TheLoop->contains(I))
7840 return TTI::CastContextHint::Normal;
7842 switch (getWideningDecision(I, VF)) {
7843 case LoopVectorizationCostModel::CM_GatherScatter:
7844 return TTI::CastContextHint::GatherScatter;
7845 case LoopVectorizationCostModel::CM_Interleave:
7846 return TTI::CastContextHint::Interleave;
7847 case LoopVectorizationCostModel::CM_Scalarize:
7848 case LoopVectorizationCostModel::CM_Widen:
7849 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7850 : TTI::CastContextHint::Normal;
7851 case LoopVectorizationCostModel::CM_Widen_Reverse:
7852 return TTI::CastContextHint::Reversed;
7853 case LoopVectorizationCostModel::CM_Unknown:
7854 llvm_unreachable("Instr did not go through cost modelling?");
7857 llvm_unreachable("Unhandled case!");
7860 unsigned Opcode = I->getOpcode();
7861 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7862 // For Trunc, the context is the only user, which must be a StoreInst.
7863 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7864 if (I->hasOneUse())
7865 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7866 CCH = ComputeCCH(Store);
7868 // For Z/Sext, the context is the operand, which must be a LoadInst.
7869 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7870 Opcode == Instruction::FPExt) {
7871 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7872 CCH = ComputeCCH(Load);
7875 // We optimize the truncation of induction variables having constant
7876 // integer steps. The cost of these truncations is the same as the scalar
7877 // operation.
7878 if (isOptimizableIVTruncate(I, VF)) {
7879 auto *Trunc = cast<TruncInst>(I);
7880 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7881 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7884 // Detect reduction patterns
7885 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7886 return *RedCost;
7888 Type *SrcScalarTy = I->getOperand(0)->getType();
7889 Type *SrcVecTy =
7890 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7891 if (canTruncateToMinimalBitwidth(I, VF)) {
7892 // This cast is going to be shrunk. This may remove the cast or it might
7893       // turn it into a slightly different cast. For example, if MinBW == 16,
7894 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7896 // Calculate the modified src and dest types.
7897 Type *MinVecTy = VectorTy;
7898 if (Opcode == Instruction::Trunc) {
7899 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7900 VectorTy =
7901 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7902 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7903 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7904 VectorTy =
7905 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7909 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7911 case Instruction::Call: {
7912 bool NeedToScalarize;
7913 CallInst *CI = cast<CallInst>(I);
7914 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7915 if (getVectorIntrinsicIDForCall(CI, TLI)) {
7916 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7917 return std::min(CallCost, IntrinsicCost);
7919 return CallCost;
7921 case Instruction::ExtractValue:
7922 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7923 case Instruction::Alloca:
7924     // We cannot easily widen an alloca to a scalable alloca, as
7925 // the result would need to be a vector of pointers.
7926 if (VF.isScalable())
7927 return InstructionCost::getInvalid();
7928 LLVM_FALLTHROUGH;
7929 default:
7930 // This opcode is unknown. Assume that it is the same as 'mul'.
7931 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7932 } // end of switch.
7935 char LoopVectorize::ID = 0;
7937 static const char lv_name[] = "Loop Vectorization";
7939 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7940 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7941 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7942 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7943 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7944 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7945 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7946 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7947 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7948 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7949 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7950 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7951 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7952 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7953 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7954 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7956 namespace llvm {
7958 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7960 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7961 bool VectorizeOnlyWhenForced) {
7962 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7965 } // end namespace llvm
7967 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7968 // Check if the pointer operand of a load or store instruction is
7969 // consecutive.
7970 if (auto *Ptr = getLoadStorePointerOperand(Inst))
7971 return Legal->isConsecutivePtr(Ptr);
7972 return false;
7975 void LoopVectorizationCostModel::collectValuesToIgnore() {
7976 // Ignore ephemeral values.
7977 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7979 // Ignore type-promoting instructions we identified during reduction
7980 // detection.
7981 for (auto &Reduction : Legal->getReductionVars()) {
7982 RecurrenceDescriptor &RedDes = Reduction.second;
7983 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7984 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7986 // Ignore type-casting instructions we identified during induction
7987 // detection.
7988 for (auto &Induction : Legal->getInductionVars()) {
7989 InductionDescriptor &IndDes = Induction.second;
7990 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7991 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7995 void LoopVectorizationCostModel::collectInLoopReductions() {
7996 for (auto &Reduction : Legal->getReductionVars()) {
7997 PHINode *Phi = Reduction.first;
7998 RecurrenceDescriptor &RdxDesc = Reduction.second;
8000 // We don't collect reductions that are type promoted (yet).
8001 if (RdxDesc.getRecurrenceType() != Phi->getType())
8002 continue;
8004 // If the target would prefer this reduction to happen "in-loop", then we
8005 // want to record it as such.
8006 unsigned Opcode = RdxDesc.getOpcode();
8007 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
8008 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
8009 TargetTransformInfo::ReductionFlags()))
8010 continue;
8012 // Check that we can correctly put the reductions into the loop, by
8013 // finding the chain of operations that leads from the phi to the loop
8014 // exit value.
8015 SmallVector<Instruction *, 4> ReductionOperations =
8016 RdxDesc.getReductionOpChain(Phi, TheLoop);
8017 bool InLoop = !ReductionOperations.empty();
8018 if (InLoop) {
8019 InLoopReductionChains[Phi] = ReductionOperations;
8020 // Add the elements to InLoopReductionImmediateChains for cost modelling.
8021 Instruction *LastChain = Phi;
8022 for (auto *I : ReductionOperations) {
8023 InLoopReductionImmediateChains[I] = LastChain;
8024 LastChain = I;
8027 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
8028 << " reduction for phi: " << *Phi << "\n");
8032 // TODO: we could return a pair of values that specify the max VF and
8033 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
8034 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
8035 // doesn't have a cost model that can choose which plan to execute if
8036 // more than one is generated.
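// For illustration, 256-bit wide vector registers and a widest loop type of
// i32 yield a VPlan VF of 256 / 32 = 8.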
8037 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
8038 LoopVectorizationCostModel &CM) {
8039 unsigned WidestType;
8040 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
8041 return WidestVectorRegBits / WidestType;
8044 VectorizationFactor
8045 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
8046 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
8047 ElementCount VF = UserVF;
8048   // Outer loop handling: outer loops may require CFG and instruction level
8049 // transformations before even evaluating whether vectorization is profitable.
8050 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8051 // the vectorization pipeline.
8052 if (!OrigLoop->isInnermost()) {
8053 // If the user doesn't provide a vectorization factor, determine a
8054 // reasonable one.
8055 if (UserVF.isZero()) {
8056 VF = ElementCount::getFixed(determineVPlanVF(
8057 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
8058 .getFixedSize(),
8059 CM));
8060 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
8062 // Make sure we have a VF > 1 for stress testing.
8063 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
8064 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
8065 << "overriding computed VF.\n");
8066 VF = ElementCount::getFixed(4);
8069 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8070 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
8071 "VF needs to be a power of two");
8072 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
8073 << "VF " << VF << " to build VPlans.\n");
8074 buildVPlans(VF, VF);
8076 // For VPlan build stress testing, we bail out after VPlan construction.
8077 if (VPlanBuildStressTest)
8078 return VectorizationFactor::Disabled();
8080 return {VF, 0 /*Cost*/};
8083 LLVM_DEBUG(
8084 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
8085 "VPlan-native path.\n");
8086 return VectorizationFactor::Disabled();
8089 Optional<VectorizationFactor>
8090 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
8091 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8092 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
8093   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
8094 return None;
8096   // Invalidate interleave groups if all blocks of the loop will be predicated.
8097 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
8098 !useMaskedInterleavedAccesses(*TTI)) {
8099 LLVM_DEBUG(
8100 dbgs()
8101 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
8102 "which requires masked-interleaved support.\n");
8103 if (CM.InterleaveInfo.invalidateGroups())
8104 // Invalidating interleave groups also requires invalidating all decisions
8105 // based on them, which includes widening decisions and uniform and scalar
8106 // values.
8107 CM.invalidateCostModelingDecisions();
8110 ElementCount MaxUserVF =
8111 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
8112 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
8113 if (!UserVF.isZero() && UserVFIsLegal) {
8114 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
8115 "VF needs to be a power of two");
8116 // Collect the instructions (and their associated costs) that will be more
8117 // profitable to scalarize.
8118 if (CM.selectUserVectorizationFactor(UserVF)) {
8119 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
8120 CM.collectInLoopReductions();
8121 buildVPlansWithVPRecipes(UserVF, UserVF);
8122 LLVM_DEBUG(printPlans(dbgs()));
8123 return {{UserVF, 0}};
8124 } else
8125 reportVectorizationInfo("UserVF ignored because of invalid costs.",
8126 "InvalidCost", ORE, OrigLoop);
8129 // Populate the set of Vectorization Factor Candidates.
8130 ElementCountSet VFCandidates;
8131 for (auto VF = ElementCount::getFixed(1);
8132 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
8133 VFCandidates.insert(VF);
8134 for (auto VF = ElementCount::getScalable(1);
8135 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
8136 VFCandidates.insert(VF);
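  // The candidate set thus holds the powers of two 1, 2, 4, ... up to the
  // maximum fixed VF, plus vscale x 1, vscale x 2, ... up to the maximum
  // scalable VF.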
8138 for (const auto &VF : VFCandidates) {
8139 // Collect Uniform and Scalar instructions after vectorization with VF.
8140 CM.collectUniformsAndScalars(VF);
8142 // Collect the instructions (and their associated costs) that will be more
8143 // profitable to scalarize.
8144 if (VF.isVector())
8145 CM.collectInstsToScalarize(VF);
8148 CM.collectInLoopReductions();
8149 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
8150 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
8152 LLVM_DEBUG(printPlans(dbgs()));
8153 if (!MaxFactors.hasVector())
8154 return VectorizationFactor::Disabled();
8156 // Select the optimal vectorization factor.
8157 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
8159 // Check if it is profitable to vectorize with runtime checks.
8160 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
8161 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
8162 bool PragmaThresholdReached =
8163 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
8164 bool ThresholdReached =
8165 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
8166 if ((ThresholdReached && !Hints.allowReordering()) ||
8167 PragmaThresholdReached) {
8168 ORE->emit([&]() {
8169 return OptimizationRemarkAnalysisAliasing(
8170 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
8171 OrigLoop->getHeader())
8172 << "loop not vectorized: cannot prove it is safe to reorder "
8173 "memory operations";
8175 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8176 Hints.emitRemarkWithHints();
8177 return VectorizationFactor::Disabled();
8180 return SelectedVF;
8183 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
8184 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
8185 << '\n');
8186 BestVF = VF;
8187 BestUF = UF;
8189 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
8190 return !Plan->hasVF(VF);
8192   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
8195 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
8196 DominatorTree *DT) {
8197 // Perform the actual loop transformation.
8199 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
8200 assert(BestVF.hasValue() && "Vectorization Factor is missing");
8201 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
8203 VPTransformState State{
8204 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
8205 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
8206 State.TripCount = ILV.getOrCreateTripCount(nullptr);
8207 State.CanonicalIV = ILV.Induction;
8209 ILV.printDebugTracesAtStart();
8211 //===------------------------------------------------===//
8213 // Notice: any optimization or new instruction that goes
8214 // into the code below should also be implemented in
8215 // the cost-model.
8217 //===------------------------------------------------===//
8219 // 2. Copy and widen instructions from the old loop into the new loop.
8220 VPlans.front()->execute(&State);
8222 // 3. Fix the vectorized code: take care of header phi's, live-outs,
8223 // predication, updating analyses.
8224 ILV.fixVectorizedLoop(State);
8226 ILV.printDebugTracesAtEnd();
8229 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8230 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8231 for (const auto &Plan : VPlans)
8232 if (PrintVPlansInDotFormat)
8233 Plan->printDOT(O);
8234 else
8235 Plan->print(O);
8237 #endif
8239 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8240 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8242 // We create new control-flow for the vectorized loop, so the original exit
8243 // conditions will be dead after vectorization if they are only used by the
8244 // terminator.
8245 SmallVector<BasicBlock*> ExitingBlocks;
8246 OrigLoop->getExitingBlocks(ExitingBlocks);
8247 for (auto *BB : ExitingBlocks) {
8248 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8249 if (!Cmp || !Cmp->hasOneUse())
8250 continue;
8252 // TODO: we should introduce a getUniqueExitingBlocks on Loop
8253 if (!DeadInstructions.insert(Cmp).second)
8254 continue;
8256 // The operands of the icmp are often a dead trunc, used by IndUpdate.
8257 // TODO: can recurse through operands in general
8258 for (Value *Op : Cmp->operands()) {
8259 if (isa<TruncInst>(Op) && Op->hasOneUse())
8260 DeadInstructions.insert(cast<Instruction>(Op));
8264 // We create new "steps" for induction variable updates to which the original
8265 // induction variables map. An original update instruction will be dead if
8266 // all its users except the induction variable are dead.
8267 auto *Latch = OrigLoop->getLoopLatch();
8268 for (auto &Induction : Legal->getInductionVars()) {
8269 PHINode *Ind = Induction.first;
8270 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8272 // If the tail is to be folded by masking, the primary induction variable,
8273 // if it exists, isn't dead: it will be used for masking. Don't kill it.
8274 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8275 continue;
8277 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8278 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8280 DeadInstructions.insert(IndUpdate);
8282 // We record as "Dead" also the type-casting instructions we had identified
8283 // during induction analysis. We don't need any handling for them in the
8284 // vectorized loop because we have proven that, under a proper runtime
8285 // test guarding the vectorized loop, the value of the phi, and the casted
8286 // value of the phi, are the same. The last instruction in this casting chain
8287 // will get its scalar/vector/widened def from the scalar/vector/widened def
8288 // of the respective phi node. Any other casts in the induction def-use chain
8289 // have no other uses outside the phi update chain, and will be ignored.
8290 InductionDescriptor &IndDes = Induction.second;
8291 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8292 DeadInstructions.insert(Casts.begin(), Casts.end());
8296 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8298 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8300 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
8301 Instruction::BinaryOps BinOp) {
8302 // When unrolling and the VF is 1, we only need to add a simple scalar.
8303 Type *Ty = Val->getType();
8304 assert(!Ty->isVectorTy() && "Val must be a scalar");
8306 if (Ty->isFloatingPointTy()) {
8307 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
8309 // Floating-point operations inherit FMF via the builder's flags.
8310 Value *MulOp = Builder.CreateFMul(C, Step);
8311 return Builder.CreateBinOp(BinOp, Val, MulOp);
8313 Constant *C = ConstantInt::get(Ty, StartIdx);
8314 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
8317 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8318 SmallVector<Metadata *, 4> MDs;
8319 // Reserve first location for self reference to the LoopID metadata node.
8320 MDs.push_back(nullptr);
8321 bool IsUnrollMetadata = false;
8322 MDNode *LoopID = L->getLoopID();
8323 if (LoopID) {
8324 // First find existing loop unrolling disable metadata.
8325 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8326 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8327 if (MD) {
8328 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8329 IsUnrollMetadata =
8330 S && S->getString().startswith("llvm.loop.unroll.disable");
8332 MDs.push_back(LoopID->getOperand(i));
8336 if (!IsUnrollMetadata) {
8337 // Add runtime unroll disable metadata.
8338 LLVMContext &Context = L->getHeader()->getContext();
8339 SmallVector<Metadata *, 1> DisableOperands;
8340 DisableOperands.push_back(
8341 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8342 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8343 MDs.push_back(DisableNode);
8344 MDNode *NewLoopID = MDNode::get(Context, MDs);
8345 // Set operand 0 to refer to the loop id itself.
8346 NewLoopID->replaceOperandWith(0, NewLoopID);
8347 L->setLoopID(NewLoopID);
8351 //===--------------------------------------------------------------------===//
8352 // EpilogueVectorizerMainLoop
8353 //===--------------------------------------------------------------------===//
8355 /// This function is partially responsible for generating the control flow
8356 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8357 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8358 MDNode *OrigLoopID = OrigLoop->getLoopID();
8359 Loop *Lp = createVectorLoopSkeleton("");
8361 // Generate the code to check the minimum iteration count of the vector
8362 // epilogue (see below).
8363 EPI.EpilogueIterationCountCheck =
8364 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8365 EPI.EpilogueIterationCountCheck->setName("iter.check");
8367 // Generate the code to check any assumptions that we've made for SCEV
8368 // expressions.
8369 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8371 // Generate the code that checks at runtime if arrays overlap. We put the
8372 // checks into a separate block to make the more common case of few elements
8373 // faster.
8374 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8376 // Generate the iteration count check for the main loop, *after* the check
8377 // for the epilogue loop, so that the path-length is shorter for the case
8378 // that goes directly through the vector epilogue. The longer-path length for
8379 // the main loop is compensated for, by the gain from vectorizing the larger
8380 // trip count. Note: the branch will get updated later on when we vectorize
8381 // the epilogue.
8382 EPI.MainLoopIterationCountCheck =
8383 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8385 // Generate the induction variable.
8386 OldInduction = Legal->getPrimaryInduction();
8387 Type *IdxTy = Legal->getWidestInductionType();
8388 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8389 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8390 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8391 EPI.VectorTripCount = CountRoundDown;
8392 Induction =
8393 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8394 getDebugLocFromInstOrOperands(OldInduction));
8396 // Skip creating induction resume values here because they will be created in
8397 // the second pass. If we created them here, they wouldn't be used anyway,
8398 // because the vplan in the second pass still contains the inductions from the
8399 // original loop.
8401 return completeLoopSkeleton(Lp, OrigLoopID);
8404 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8405 LLVM_DEBUG({
8406 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8407 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8408 << ", Main Loop UF:" << EPI.MainLoopUF
8409 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8410 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8414 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8415 DEBUG_WITH_TYPE(VerboseDebug, {
8416 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
8420 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8421 Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8422 assert(L && "Expected valid Loop.");
8423 assert(Bypass && "Expected valid bypass basic block.");
8424 unsigned VFactor =
8425 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
8426 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8427 Value *Count = getOrCreateTripCount(L);
8428 // Reuse existing vector loop preheader for TC checks.
8429 // Note that new preheader block is generated for vector loop.
8430 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8431 IRBuilder<> Builder(TCCheckBlock->getTerminator());
8433 // Generate code to check if the loop's trip count is less than VF * UF of the
8434 // main vector loop.
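// If a scalar epilogue is required, at least one iteration must be left for
// it, so bypass the vector loop even when the trip count equals VF * UF
// (hence ULE rather than ULT). A hypothetical shape of the emitted check,
// assuming VF=4 and UF=2:
//   %min.iters.check = icmp ule i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph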
8435 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8436 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8438 Value *CheckMinIters = Builder.CreateICmp(
8439 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
8440 "min.iters.check");
8442 if (!ForEpilogue)
8443 TCCheckBlock->setName("vector.main.loop.iter.check");
8445 // Create new preheader for vector loop.
8446 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8447 DT, LI, nullptr, "vector.ph");
8449 if (ForEpilogue) {
8450 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8451 DT->getNode(Bypass)->getIDom()) &&
8452 "TC check is expected to dominate Bypass");
8454 // Update dominator for Bypass & LoopExit.
8455 DT->changeImmediateDominator(Bypass, TCCheckBlock);
8456 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8457 // For loops with multiple exits, there's no edge from the middle block
8458 // to exit blocks (as the epilogue must run) and thus no need to update
8459 // the immediate dominator of the exit blocks.
8460 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8462 LoopBypassBlocks.push_back(TCCheckBlock);
8464 // Save the trip count so we don't have to regenerate it in the
8465 // vec.epilog.iter.check. This is safe to do because the trip count
8466 // generated here dominates the vector epilog iter check.
8467 EPI.TripCount = Count;
8470 ReplaceInstWithInst(
8471 TCCheckBlock->getTerminator(),
8472 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8474 return TCCheckBlock;
8477 //===--------------------------------------------------------------------===//
8478 // EpilogueVectorizerEpilogueLoop
8479 //===--------------------------------------------------------------------===//
8481 /// This function is partially responsible for generating the control flow
8482 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8483 BasicBlock *
8484 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8485 MDNode *OrigLoopID = OrigLoop->getLoopID();
8486 Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8488 // Now, compare the remaining count and if there aren't enough iterations to
8489 // execute the vectorized epilogue, skip to the scalar part.
8490 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8491 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8492 LoopVectorPreHeader =
8493 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8494 LI, nullptr, "vec.epilog.ph");
8495 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8496 VecEpilogueIterationCountCheck);
8498 // Adjust the control flow taking the state info from the main loop
8499 // vectorization into account.
8500 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8501 "expected this to be saved from the previous pass.");
8502 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8503 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8505 DT->changeImmediateDominator(LoopVectorPreHeader,
8506 EPI.MainLoopIterationCountCheck);
8508 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8509 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8511 if (EPI.SCEVSafetyCheck)
8512 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8513 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8514 if (EPI.MemSafetyCheck)
8515 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8516 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8518 DT->changeImmediateDominator(
8519 VecEpilogueIterationCountCheck,
8520 VecEpilogueIterationCountCheck->getSinglePredecessor());
8522 DT->changeImmediateDominator(LoopScalarPreHeader,
8523 EPI.EpilogueIterationCountCheck);
8524 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8525 // If there is an epilogue which must run, there's no edge from the
8526 // middle block to exit blocks and thus no need to update the immediate
8527 // dominator of the exit blocks.
8528 DT->changeImmediateDominator(LoopExitBlock,
8529 EPI.EpilogueIterationCountCheck);
8531 // Keep track of bypass blocks, as they feed start values to the induction
8532 // phis in the scalar loop preheader.
8533 if (EPI.SCEVSafetyCheck)
8534 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8535 if (EPI.MemSafetyCheck)
8536 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8537 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8539 // Generate a resume induction for the vector epilogue and put it in the
8540 // vector epilogue preheader.
8541 Type *IdxTy = Legal->getWidestInductionType();
8542 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8543 LoopVectorPreHeader->getFirstNonPHI());
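// The resume value is the main loop's vector trip count when control reaches
// the epilogue from the main vector loop, and zero when the main vector loop
// was bypassed entirely by its iteration-count check.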
8544 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8545 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8546 EPI.MainLoopIterationCountCheck);
8548 // Generate the induction variable.
8549 OldInduction = Legal->getPrimaryInduction();
8550 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8551 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8552 Value *StartIdx = EPResumeVal;
8553 Induction =
8554 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8555 getDebugLocFromInstOrOperands(OldInduction));
8557 // Generate induction resume values. These variables save the new starting
8558 // indexes for the scalar loop. They are used to test if there are any tail
8559 // iterations left once the vector loop has completed.
8560 // Note that when the vectorized epilogue is skipped due to the iteration count
8561 // check, the resume value for the induction variable comes from
8562 // the trip count of the main vector loop, hence passing the AdditionalBypass
8563 // argument.
8564 createInductionResumeValues(Lp, CountRoundDown,
8565 {VecEpilogueIterationCountCheck,
8566 EPI.VectorTripCount} /* AdditionalBypass */);
8568 AddRuntimeUnrollDisableMetaData(Lp);
8569 return completeLoopSkeleton(Lp, OrigLoopID);
8572 BasicBlock *
8573 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8574 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8576 assert(EPI.TripCount &&
8577 "Expected trip count to have been saved in the first pass.");
8578 assert(
8579 (!isa<Instruction>(EPI.TripCount) ||
8580 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8581 "saved trip count does not dominate insertion point.");
8582 Value *TC = EPI.TripCount;
8583 IRBuilder<> Builder(Insert->getTerminator());
8584 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8586 // Generate code to check if the loop's trip count is less than VF * UF of the
8587 // vector epilogue loop.
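// As for the main loop check above: use ULE when a scalar epilogue must still
// run after the vector epilogue, ULT otherwise.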
8588 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8589 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8591 Value *CheckMinIters = Builder.CreateICmp(
8592 P, Count,
8593 ConstantInt::get(Count->getType(),
8594 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8595 "min.epilog.iters.check");
8597 ReplaceInstWithInst(
8598 Insert->getTerminator(),
8599 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8601 LoopBypassBlocks.push_back(Insert);
8602 return Insert;
8605 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8606 LLVM_DEBUG({
8607 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8608 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8609 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8613 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8614 DEBUG_WITH_TYPE(VerboseDebug, {
8615 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8619 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8620 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8621 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8622 bool PredicateAtRangeStart = Predicate(Range.Start);
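// Walk the remaining VFs in the range and shrink Range.End to the first VF
// whose answer differs from the answer at Range.Start, so that every VF left
// in the range shares the same decision. For example, for Range = [4, 32) and
// a predicate that flips at VF=16, the range is clamped to [4, 16).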
8624 for (ElementCount TmpVF = Range.Start * 2;
8625 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8626 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8627 Range.End = TmpVF;
8628 break;
8631 return PredicateAtRangeStart;
8634 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8635 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8636 /// of VF's starting at a given VF and extending it as much as possible. Each
8637 /// vectorization decision can potentially shorten this sub-range during
8638 /// buildVPlan().
8639 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8640 ElementCount MaxVF) {
8641 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8642 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8643 VFRange SubRange = {VF, MaxVFPlusOne};
8644 VPlans.push_back(buildVPlan(SubRange));
8645 VF = SubRange.End;
8649 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8650 VPlanPtr &Plan) {
8651 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8653 // Look for cached value.
8654 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8655 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8656 if (ECEntryIt != EdgeMaskCache.end())
8657 return ECEntryIt->second;
8659 VPValue *SrcMask = createBlockInMask(Src, Plan);
8661 // The terminator has to be a branch inst!
8662 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8663 assert(BI && "Unexpected terminator found");
8665 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8666 return EdgeMaskCache[Edge] = SrcMask;
8668 // If source is an exiting block, we know the exit edge is dynamically dead
8669 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8670 // adding uses of an otherwise potentially dead instruction.
8671 if (OrigLoop->isLoopExiting(Src))
8672 return EdgeMaskCache[Edge] = SrcMask;
8674 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8675 assert(EdgeMask && "No Edge Mask found for condition");
8677 if (BI->getSuccessor(0) != Dst)
8678 EdgeMask = Builder.createNot(EdgeMask);
8680 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8681 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8682 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8683 // The select version does not introduce new UB if SrcMask is false and
8684 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8685 VPValue *False = Plan->getOrAddVPValue(
8686 ConstantInt::getFalse(BI->getCondition()->getType()));
8687 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8690 return EdgeMaskCache[Edge] = EdgeMask;
8693 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8694 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8696 // Look for cached value.
8697 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8698 if (BCEntryIt != BlockMaskCache.end())
8699 return BCEntryIt->second;
8701 // All-one mask is modelled as no-mask following the convention for masked
8702 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8703 VPValue *BlockMask = nullptr;
8705 if (OrigLoop->getHeader() == BB) {
8706 if (!CM.blockNeedsPredication(BB))
8707 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8709 // Create the block in mask as the first non-phi instruction in the block.
8710 VPBuilder::InsertPointGuard Guard(Builder);
8711 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8712 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8714 // Introduce the early-exit compare IV <= BTC to form header block mask.
8715 // This is used instead of IV < TC because TC may wrap, unlike BTC.
8716 // Start by constructing the desired canonical IV.
8717 VPValue *IV = nullptr;
8718 if (Legal->getPrimaryInduction())
8719 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8720 else {
8721 auto IVRecipe = new VPWidenCanonicalIVRecipe();
8722 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8723 IV = IVRecipe->getVPSingleValue();
8725 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8726 bool TailFolded = !CM.isScalarEpilogueAllowed();
8728 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8729 // While ActiveLaneMask is a binary op that consumes the loop tripcount
8730 // as a second argument, we only pass the IV here and extract the
8731 // tripcount from the transform state where codegen of the VP instructions
8732 // happens.
8733 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8734 } else {
8735 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8737 return BlockMaskCache[BB] = BlockMask;
8740 // This is the block mask. We OR all incoming edges.
8741 for (auto *Predecessor : predecessors(BB)) {
8742 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8743 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8744 return BlockMaskCache[BB] = EdgeMask;
8746 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8747 BlockMask = EdgeMask;
8748 continue;
8751 BlockMask = Builder.createOr(BlockMask, EdgeMask);
8754 return BlockMaskCache[BB] = BlockMask;
8757 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8758 ArrayRef<VPValue *> Operands,
8759 VFRange &Range,
8760 VPlanPtr &Plan) {
8761 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8762 "Must be called with either a load or store");
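// Per-VF query used to clamp the range: the memory instruction is widened
// unless the VF is scalar or the cost model decided to scalarize it;
// interleaving counts as widening here because the interleave-group recipe is
// introduced later.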
8764 auto willWiden = [&](ElementCount VF) -> bool {
8765 if (VF.isScalar())
8766 return false;
8767 LoopVectorizationCostModel::InstWidening Decision =
8768 CM.getWideningDecision(I, VF);
8769 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8770 "CM decision should be taken at this point.");
8771 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8772 return true;
8773 if (CM.isScalarAfterVectorization(I, VF) ||
8774 CM.isProfitableToScalarize(I, VF))
8775 return false;
8776 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8779 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8780 return nullptr;
8782 VPValue *Mask = nullptr;
8783 if (Legal->isMaskRequired(I))
8784 Mask = createBlockInMask(I->getParent(), Plan);
8786 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8787 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
8789 StoreInst *Store = cast<StoreInst>(I);
8790 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8791 Mask);
8794 VPWidenIntOrFpInductionRecipe *
8795 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8796 ArrayRef<VPValue *> Operands) const {
8797 // Check if this is an integer or fp induction. If so, build the recipe that
8798 // produces its scalar and vector values.
8799 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8800 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8801 II.getKind() == InductionDescriptor::IK_FpInduction) {
8802 assert(II.getStartValue() ==
8803 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8804 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8805 return new VPWidenIntOrFpInductionRecipe(
8806 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
8809 return nullptr;
8812 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8813 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8814 VPlan &Plan) const {
8815 // Optimize the special case where the source is a constant integer
8816 // induction variable. Notice that we can only optimize the 'trunc' case
8817 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8818 // (c) other casts depend on pointer size.
8820 // Determine whether \p K is a truncation based on an induction variable that
8821 // can be optimized.
8822 auto isOptimizableIVTruncate =
8823 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8824 return [=](ElementCount VF) -> bool {
8825 return CM.isOptimizableIVTruncate(K, VF);
8829 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8830 isOptimizableIVTruncate(I), Range)) {
8832 InductionDescriptor II =
8833 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8834 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8835 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8836 Start, nullptr, I);
8838 return nullptr;
8841 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8842 ArrayRef<VPValue *> Operands,
8843 VPlanPtr &Plan) {
8844 // If all incoming values are equal, the incoming VPValue can be used directly
8845 // instead of creating a new VPBlendRecipe.
8846 VPValue *FirstIncoming = Operands[0];
8847 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8848 return FirstIncoming == Inc;
8849 })) {
8850 return Operands[0];
8853 // We know that all PHIs in non-header blocks are converted into selects, so
8854 // we don't have to worry about the insertion order and we can just use the
8855 // builder. At this point we generate the predication tree. There may be
8856 // duplications since this is a simple recursive scan, but future
8857 // optimizations will clean it up.
8858 SmallVector<VPValue *, 2> OperandsWithMask;
8859 unsigned NumIncoming = Phi->getNumIncomingValues();
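// Build the blend operands as (incoming value, edge mask) pairs, e.g.
// (v0, m0, v1, m1) for a two-way blend; a null edge mask means the edge is
// unmasked and, per the assert below, is only expected for a single incoming
// value, in which case the mask operand is simply omitted.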
8861 for (unsigned In = 0; In < NumIncoming; In++) {
8862 VPValue *EdgeMask =
8863 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8864 assert((EdgeMask || NumIncoming == 1) &&
8865 "Multiple predecessors with one having a full mask");
8866 OperandsWithMask.push_back(Operands[In]);
8867 if (EdgeMask)
8868 OperandsWithMask.push_back(EdgeMask);
8870 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8873 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8874 ArrayRef<VPValue *> Operands,
8875 VFRange &Range) const {
8877 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8878 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8879 Range);
8881 if (IsPredicated)
8882 return nullptr;
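// A few intrinsics (assumes, lifetime markers, side-effect/pseudo-probe
// markers, noalias scope declarations) are never widened into vector calls;
// returning nullptr here lets the generic replication path deal with them.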
8884 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8885 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8886 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8887 ID == Intrinsic::pseudoprobe ||
8888 ID == Intrinsic::experimental_noalias_scope_decl))
8889 return nullptr;
8891 auto willWiden = [&](ElementCount VF) -> bool {
8892 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8893 // The following case may be scalarized depending on the VF.
8894 // The flag shows whether we use an intrinsic or a plain call for the
8895 // vectorized version of the instruction.
8896 // Is it beneficial to perform the intrinsic call compared to the lib call?
8897 bool NeedToScalarize = false;
8898 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8899 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8900 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8901 return UseVectorIntrinsic || !NeedToScalarize;
8904 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8905 return nullptr;
8907 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
8908 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8911 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8912 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8913 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8914 // Instruction should be widened, unless it is scalar after vectorization,
8915 // scalarization is profitable or it is predicated.
8916 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8917 return CM.isScalarAfterVectorization(I, VF) ||
8918 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8920 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8921 Range);
8924 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8925 ArrayRef<VPValue *> Operands) const {
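// Only simple single-result opcodes (arithmetic, logical, casts, compares,
// selects) are widened by the generic VPWidenRecipe; anything else should
// have been matched by a more specific recipe earlier or will be replicated.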
8926 auto IsVectorizableOpcode = [](unsigned Opcode) {
8927 switch (Opcode) {
8928 case Instruction::Add:
8929 case Instruction::And:
8930 case Instruction::AShr:
8931 case Instruction::BitCast:
8932 case Instruction::FAdd:
8933 case Instruction::FCmp:
8934 case Instruction::FDiv:
8935 case Instruction::FMul:
8936 case Instruction::FNeg:
8937 case Instruction::FPExt:
8938 case Instruction::FPToSI:
8939 case Instruction::FPToUI:
8940 case Instruction::FPTrunc:
8941 case Instruction::FRem:
8942 case Instruction::FSub:
8943 case Instruction::ICmp:
8944 case Instruction::IntToPtr:
8945 case Instruction::LShr:
8946 case Instruction::Mul:
8947 case Instruction::Or:
8948 case Instruction::PtrToInt:
8949 case Instruction::SDiv:
8950 case Instruction::Select:
8951 case Instruction::SExt:
8952 case Instruction::Shl:
8953 case Instruction::SIToFP:
8954 case Instruction::SRem:
8955 case Instruction::Sub:
8956 case Instruction::Trunc:
8957 case Instruction::UDiv:
8958 case Instruction::UIToFP:
8959 case Instruction::URem:
8960 case Instruction::Xor:
8961 case Instruction::ZExt:
8962 return true;
8964 return false;
8967 if (!IsVectorizableOpcode(I->getOpcode()))
8968 return nullptr;
8970 // Success: widen this instruction.
8971 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8974 void VPRecipeBuilder::fixHeaderPhis() {
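// Add the backedge operand to each header phi recorded in PhisToFix, now that
// the recipe feeding the phi from the loop latch is guaranteed to exist.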
8975 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8976 for (VPWidenPHIRecipe *R : PhisToFix) {
8977 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8978 VPRecipeBase *IncR =
8979 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8980 R->addOperand(IncR->getVPSingleValue());
8984 VPBasicBlock *VPRecipeBuilder::handleReplication(
8985 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8986 VPlanPtr &Plan) {
8987 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8988 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8989 Range);
8991 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8992 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
8994 // Even if the instruction is not marked as uniform, there are certain
8995 // intrinsic calls that can be effectively treated as such, so we check for
8996 // them here. Conservatively, we only do this for scalable vectors, since
8997 // for fixed-width VFs we can always fall back on full scalarization.
8998 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8999 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
9000 case Intrinsic::assume:
9001 case Intrinsic::lifetime_start:
9002 case Intrinsic::lifetime_end:
9003 // For scalable vectors if one of the operands is variant then we still
9004 // want to mark as uniform, which will generate one instruction for just
9005 // the first lane of the vector. We can't scalarize the call in the same
9006 // way as for fixed-width vectors because we don't know how many lanes
9007 // there are.
9009 // The reasons for doing it this way for scalable vectors are:
9010 // 1. For the assume intrinsic generating the instruction for the first
9011 // lane is still better than not generating any at all. For
9012 // example, the input may be a splat across all lanes.
9013 // 2. For the lifetime start/end intrinsics the pointer operand only
9014 // does anything useful when the input comes from a stack object,
9015 // which suggests it should always be uniform. For non-stack objects
9016 // the effect is to poison the object, which still allows us to
9017 // remove the call.
9018 IsUniform = true;
9019 break;
9020 default:
9021 break;
9025 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
9026 IsUniform, IsPredicated);
9027 setRecipe(I, Recipe);
9028 Plan->addVPValue(I, Recipe);
9030 // Find if I uses a predicated instruction. If so, it will use its scalar
9031 // value. Avoid hoisting the insert-element which packs the scalar value into
9032 // a vector value, as that happens iff all users use the vector value.
9033 for (VPValue *Op : Recipe->operands()) {
9034 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
9035 if (!PredR)
9036 continue;
9037 auto *RepR =
9038 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
9039 assert(RepR->isPredicated() &&
9040 "expected Replicate recipe to be predicated");
9041 RepR->setAlsoPack(false);
9044 // Finalize the recipe for Instr, first if it is not predicated.
9045 if (!IsPredicated) {
9046 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
9047 VPBB->appendRecipe(Recipe);
9048 return VPBB;
9050 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
9051 assert(VPBB->getSuccessors().empty() &&
9052 "VPBB has successors when handling predicated replication.");
9053 // Record predicated instructions for above packing optimizations.
9054 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
9055 VPBlockUtils::insertBlockAfter(Region, VPBB);
9056 auto *RegSucc = new VPBasicBlock();
9057 VPBlockUtils::insertBlockAfter(RegSucc, Region);
9058 return RegSucc;
9061 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
9062 VPRecipeBase *PredRecipe,
9063 VPlanPtr &Plan) {
9064 // Instructions marked for predication are replicated and placed under an
9065 // if-then construct to prevent side-effects.
9067 // Generate recipes to compute the block mask for this region.
9068 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
9070 // Build the triangular if-then region.
9071 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
9072 assert(Instr->getParent() && "Predicated instruction not in any basic block");
9073 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
9074 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
9075 auto *PHIRecipe = Instr->getType()->isVoidTy()
9076 ? nullptr
9077 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
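// If the replicated instruction defines a value, users outside the region
// must see the merged value produced by the PHI recipe, so re-map the
// instruction from the replicate recipe to the PHI recipe.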
9078 if (PHIRecipe) {
9079 Plan->removeVPValueFor(Instr);
9080 Plan->addVPValue(Instr, PHIRecipe);
9082 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
9083 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
9084 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
9086 // Note: first set Entry as region entry and then connect successors starting
9087 // from it in order, to propagate the "parent" of each VPBasicBlock.
9088 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
9089 VPBlockUtils::connectBlocks(Pred, Exit);
9091 return Region;
9094 VPRecipeOrVPValueTy
9095 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
9096 ArrayRef<VPValue *> Operands,
9097 VFRange &Range, VPlanPtr &Plan) {
9098 // First, check for specific widening recipes that deal with calls, memory
9099 // operations, inductions and Phi nodes.
9100 if (auto *CI = dyn_cast<CallInst>(Instr))
9101 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
9103 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
9104 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
9106 VPRecipeBase *Recipe;
9107 if (auto Phi = dyn_cast<PHINode>(Instr)) {
9108 if (Phi->getParent() != OrigLoop->getHeader())
9109 return tryToBlend(Phi, Operands, Plan);
9110 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
9111 return toVPRecipeResult(Recipe);
9113 VPWidenPHIRecipe *PhiRecipe = nullptr;
9114 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
9115 VPValue *StartV = Operands[0];
9116 if (Legal->isReductionVariable(Phi)) {
9117 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9118 assert(RdxDesc.getRecurrenceStartValue() ==
9119 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
9120 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
9121 CM.isInLoopReduction(Phi),
9122 CM.useOrderedReductions(RdxDesc));
9123 } else {
9124 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
9127 // Record the incoming value from the backedge, so we can add the incoming
9128 // value from the backedge after all recipes have been created.
9129 recordRecipeOf(cast<Instruction>(
9130 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
9131 PhisToFix.push_back(PhiRecipe);
9132 } else {
9133 // TODO: record start and backedge value for remaining pointer induction
9134 // phis.
9135 assert(Phi->getType()->isPointerTy() &&
9136 "only pointer phis should be handled here");
9137 PhiRecipe = new VPWidenPHIRecipe(Phi);
9140 return toVPRecipeResult(PhiRecipe);
9143 if (isa<TruncInst>(Instr) &&
9144 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
9145 Range, *Plan)))
9146 return toVPRecipeResult(Recipe);
9148 if (!shouldWiden(Instr, Range))
9149 return nullptr;
9151 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
9152 return toVPRecipeResult(new VPWidenGEPRecipe(
9153 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
9155 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
9156 bool InvariantCond =
9157 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
9158 return toVPRecipeResult(new VPWidenSelectRecipe(
9159 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
9162 return toVPRecipeResult(tryToWiden(Instr, Operands));
9165 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
9166 ElementCount MaxVF) {
9167 assert(OrigLoop->isInnermost() && "Inner loop expected.");
9169 // Collect instructions from the original loop that will become trivially dead
9170 // in the vectorized loop. We don't need to vectorize these instructions. For
9171 // example, original induction update instructions can become dead because we
9172 // separately emit induction "steps" when generating code for the new loop.
9173 // Similarly, we create a new latch condition when setting up the structure
9174 // of the new loop, so the old one can become dead.
9175 SmallPtrSet<Instruction *, 4> DeadInstructions;
9176 collectTriviallyDeadInstructions(DeadInstructions);
9178 // Add assume instructions we need to drop to DeadInstructions, to prevent
9179 // them from being added to the VPlan.
9180 // TODO: We only need to drop assumes in blocks that get flattened. If the
9181 // control flow is preserved, we should keep them.
9182 auto &ConditionalAssumes = Legal->getConditionalAssumes();
9183 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
9185 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
9186 // Dead instructions do not need sinking. Remove them from SinkAfter.
9187 for (Instruction *I : DeadInstructions)
9188 SinkAfter.erase(I);
9190 // Cannot sink instructions after dead instructions (there won't be any
9191 // recipes for them). Instead, find the first non-dead previous instruction.
9192 for (auto &P : Legal->getSinkAfter()) {
9193 Instruction *SinkTarget = P.second;
9194 Instruction *FirstInst = &*SinkTarget->getParent()->begin();
9195 (void)FirstInst;
9196 while (DeadInstructions.contains(SinkTarget)) {
9197 assert(
9198 SinkTarget != FirstInst &&
9199 "Must find a live instruction (at least the one feeding the "
9200 "first-order recurrence PHI) before reaching beginning of the block");
9201 SinkTarget = SinkTarget->getPrevNode();
9202 assert(SinkTarget != P.first &&
9203 "sink source equals target, no sinking required");
9205 P.second = SinkTarget;
9208 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9209 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9210 VFRange SubRange = {VF, MaxVFPlusOne};
9211 VPlans.push_back(
9212 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9213 VF = SubRange.End;
9217 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9218 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9219 const MapVector<Instruction *, Instruction *> &SinkAfter) {
9221 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9223 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9225 // ---------------------------------------------------------------------------
9226 // Pre-construction: record ingredients whose recipes we'll need to further
9227 // process after constructing the initial VPlan.
9228 // ---------------------------------------------------------------------------
9230 // Mark instructions we'll need to sink later and their targets as
9231 // ingredients whose recipe we'll need to record.
9232 for (auto &Entry : SinkAfter) {
9233 RecipeBuilder.recordRecipeOf(Entry.first);
9234 RecipeBuilder.recordRecipeOf(Entry.second);
9236 for (auto &Reduction : CM.getInLoopReductionChains()) {
9237 PHINode *Phi = Reduction.first;
9238 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9239 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9241 RecipeBuilder.recordRecipeOf(Phi);
9242 for (auto &R : ReductionOperations) {
9243 RecipeBuilder.recordRecipeOf(R);
9244 // For min/max reductions, where we have a pair of icmp/select, we also
9245 // need to record the ICmp recipe, so it can be removed later.
9246 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9247 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9251 // For each interleave group which is relevant for this (possibly trimmed)
9252 // Range, add it to the set of groups to be later applied to the VPlan and add
9253 // placeholders for its members' Recipes which we'll be replacing with a
9254 // single VPInterleaveRecipe.
9255 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9256 auto applyIG = [IG, this](ElementCount VF) -> bool {
9257 return (VF.isVector() && // Query is illegal for VF == 1
9258 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9259 LoopVectorizationCostModel::CM_Interleave);
9261 if (!getDecisionAndClampRange(applyIG, Range))
9262 continue;
9263 InterleaveGroups.insert(IG);
9264 for (unsigned i = 0; i < IG->getFactor(); i++)
9265 if (Instruction *Member = IG->getMember(i))
9266 RecipeBuilder.recordRecipeOf(Member);
9269 // ---------------------------------------------------------------------------
9270 // Build initial VPlan: Scan the body of the loop in a topological order to
9271 // visit each basic block after having visited its predecessor basic blocks.
9272 // ---------------------------------------------------------------------------
9274 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9275 auto Plan = std::make_unique<VPlan>();
9276 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9277 Plan->setEntry(VPBB);
9279 // Scan the body of the loop in a topological order to visit each basic block
9280 // after having visited its predecessor basic blocks.
9281 LoopBlocksDFS DFS(OrigLoop);
9282 DFS.perform(LI);
9284 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9285 // Relevant instructions from basic block BB will be grouped into VPRecipe
9286 // ingredients and fill a new VPBasicBlock.
9287 unsigned VPBBsForBB = 0;
9288 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9289 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9290 VPBB = FirstVPBBForBB;
9291 Builder.setInsertPoint(VPBB);
9293 // Introduce each ingredient into VPlan.
9294 // TODO: Model and preserve debug intrinsics in VPlan.
9295 for (Instruction &I : BB->instructionsWithoutDebug()) {
9296 Instruction *Instr = &I;
9298 // First filter out irrelevant instructions, to ensure no recipes are
9299 // built for them.
9300 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9301 continue;
9303 SmallVector<VPValue *, 4> Operands;
9304 auto *Phi = dyn_cast<PHINode>(Instr);
9305 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9306 Operands.push_back(Plan->getOrAddVPValue(
9307 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9308 } else {
9309 auto OpRange = Plan->mapToVPValues(Instr->operands());
9310 Operands = {OpRange.begin(), OpRange.end()};
9312 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9313 Instr, Operands, Range, Plan)) {
9314 // If Instr can be simplified to an existing VPValue, use it.
9315 if (RecipeOrValue.is<VPValue *>()) {
9316 auto *VPV = RecipeOrValue.get<VPValue *>();
9317 Plan->addVPValue(Instr, VPV);
9318 // If the re-used value is a recipe, register the recipe for the
9319 // instruction, in case the recipe for Instr needs to be recorded.
9320 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9321 RecipeBuilder.setRecipe(Instr, R);
9322 continue;
9324 // Otherwise, add the new recipe.
9325 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9326 for (auto *Def : Recipe->definedValues()) {
9327 auto *UV = Def->getUnderlyingValue();
9328 Plan->addVPValue(UV, Def);
9331 RecipeBuilder.setRecipe(Instr, Recipe);
9332 VPBB->appendRecipe(Recipe);
9333 continue;
9336 // Otherwise, if all widening options failed, the instruction is to be
9337 // replicated. This may create a successor for VPBB.
9338 VPBasicBlock *NextVPBB =
9339 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9340 if (NextVPBB != VPBB) {
9341 VPBB = NextVPBB;
9342 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9343 : "");
9348 RecipeBuilder.fixHeaderPhis();
9350 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
9351 // may also be empty, such as the last one (VPBB), reflecting original
9352 // basic-blocks with no recipes.
9353 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
9354 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
9355 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
9356 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
9357 delete PreEntry;
9359 // ---------------------------------------------------------------------------
9360 // Transform initial VPlan: Apply previously taken decisions, in order, to
9361 // bring the VPlan to its final state.
9362 // ---------------------------------------------------------------------------
9364 // Apply Sink-After legal constraints.
9365 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9366 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9367 if (Region && Region->isReplicator()) {
9368 assert(Region->getNumSuccessors() == 1 &&
9369 Region->getNumPredecessors() == 1 && "Expected SESE region!");
9370 assert(R->getParent()->size() == 1 &&
9371 "A recipe in an original replicator region must be the only "
9372 "recipe in its block");
9373 return Region;
9375 return nullptr;
9377 for (auto &Entry : SinkAfter) {
9378 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9379 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9381 auto *TargetRegion = GetReplicateRegion(Target);
9382 auto *SinkRegion = GetReplicateRegion(Sink);
9383 if (!SinkRegion) {
9384 // If the sink source is not a replicate region, sink the recipe directly.
9385 if (TargetRegion) {
9386 // The target is in a replication region, make sure to move Sink to
9387 // the block after it, not into the replication region itself.
9388 VPBasicBlock *NextBlock =
9389 cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9390 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9391 } else
9392 Sink->moveAfter(Target);
9393 continue;
9396 // The sink source is in a replicate region. Unhook the region from the CFG.
9397 auto *SinkPred = SinkRegion->getSinglePredecessor();
9398 auto *SinkSucc = SinkRegion->getSingleSuccessor();
9399 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9400 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9401 VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9403 if (TargetRegion) {
9404 // The target recipe is also in a replicate region, move the sink region
9405 // after the target region.
9406 auto *TargetSucc = TargetRegion->getSingleSuccessor();
9407 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9408 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9409 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9410 } else {
9411 // The sink source is in a replicate region, so we need to move the whole
9412 // replicate region, which should only contain a single recipe in the
9413 // main block.
9414 auto *SplitBlock =
9415 Target->getParent()->splitAt(std::next(Target->getIterator()));
9417 auto *SplitPred = SplitBlock->getSinglePredecessor();
9419 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9420 VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9421 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9422 if (VPBB == SplitPred)
9423 VPBB = SplitBlock;
9427 // Introduce a recipe to combine the incoming and previous values of a
9428 // first-order recurrence.
9429 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9430 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9431 if (!RecurPhi)
9432 continue;
9434 auto *RecurSplice = cast<VPInstruction>(
9435 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9436 {RecurPhi, RecurPhi->getBackedgeValue()}));
9438 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9439 if (auto *Region = GetReplicateRegion(PrevRecipe)) {
9440 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor());
9441 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi());
9442 } else
9443 RecurSplice->moveAfter(PrevRecipe);
9444 RecurPhi->replaceAllUsesWith(RecurSplice);
9445 // Set the first operand of RecurSplice to RecurPhi again, after replacing
9446 // all users.
9447 RecurSplice->setOperand(0, RecurPhi);
9450 // Interleave memory: for each Interleave Group we marked earlier as relevant
9451 // for this VPlan, replace the Recipes widening its memory instructions with a
9452 // single VPInterleaveRecipe at its insertion point.
9453 for (auto IG : InterleaveGroups) {
9454 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9455 RecipeBuilder.getRecipe(IG->getInsertPos()));
9456 SmallVector<VPValue *, 4> StoredValues;
9457 for (unsigned i = 0; i < IG->getFactor(); ++i)
9458 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9459 auto *StoreR =
9460 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9461 StoredValues.push_back(StoreR->getStoredValue());
9464 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9465 Recipe->getMask());
9466 VPIG->insertBefore(Recipe);
9467 unsigned J = 0;
9468 for (unsigned i = 0; i < IG->getFactor(); ++i)
9469 if (Instruction *Member = IG->getMember(i)) {
9470 if (!Member->getType()->isVoidTy()) {
9471 VPValue *OriginalV = Plan->getVPValue(Member);
9472 Plan->removeVPValueFor(Member);
9473 Plan->addVPValue(Member, VPIG->getVPValue(J));
9474 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9475 J++;
9477 RecipeBuilder.getRecipe(Member)->eraseFromParent();
9481 // Adjust the recipes for any inloop reductions.
9482 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
9484 VPlanTransforms::sinkScalarOperands(*Plan);
9485 VPlanTransforms::mergeReplicateRegions(*Plan);
9487 std::string PlanName;
9488 raw_string_ostream RSO(PlanName);
9489 ElementCount VF = Range.Start;
9490 Plan->addVF(VF);
9491 RSO << "Initial VPlan for VF={" << VF;
9492 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9493 Plan->addVF(VF);
9494 RSO << "," << VF;
9496 RSO << "},UF>=1";
9497 RSO.flush();
9498 Plan->setName(PlanName);
9500 return Plan;
9503 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9504 // Outer loop handling: outer loops may require CFG and instruction level
9505 // transformations before even evaluating whether vectorization is profitable.
9506 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9507 // the vectorization pipeline.
9508 assert(!OrigLoop->isInnermost());
9509 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9511 // Create new empty VPlan
9512 auto Plan = std::make_unique<VPlan>();
9514 // Build hierarchical CFG
9515 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9516 HCFGBuilder.buildHierarchicalCFG();
9518 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9519 VF *= 2)
9520 Plan->addVF(VF);
9522 if (EnableVPlanPredication) {
9523 VPlanPredicator VPP(*Plan);
9524 VPP.predicate();
9526 // Avoid running the transformation to recipes until masked code generation
9527 // in the VPlan-native path is in place.
9528 return Plan;
9531 SmallPtrSet<Instruction *, 1> DeadInstructions;
9532 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9533 Legal->getInductionVars(),
9534 DeadInstructions, *PSE.getSE());
9535 return Plan;
9538 // Adjust the recipes for reductions. For in-loop reductions the chain of
9539 // instructions leading from the loop exit instr to the phi needs to be converted
9540 // to reductions, with one operand being vector and the other being the scalar
9541 // reduction chain. For other reductions, a select is introduced between the phi
9542 // and live-out recipes when folding the tail.
9543 void LoopVectorizationPlanner::adjustRecipesForReductions(
9544 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9545 ElementCount MinVF) {
9546 for (auto &Reduction : CM.getInLoopReductionChains()) {
9547 PHINode *Phi = Reduction.first;
9548 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9549 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9551 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9552 continue;
9554 // ReductionOperations are ordered top-down from the phi's use to the
9555 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9556 // which of the two operands will remain scalar and which will be reduced.
9557 // For minmax the chain will be the select instructions.
9558 Instruction *Chain = Phi;
9559 for (Instruction *R : ReductionOperations) {
9560 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9561 RecurKind Kind = RdxDesc.getRecurrenceKind();
9563 VPValue *ChainOp = Plan->getVPValue(Chain);
9564 unsigned FirstOpId;
9565 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9566 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9567 "Expected to replace a VPWidenSelectSC");
9568 FirstOpId = 1;
9569 } else {
9570 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
9571 "Expected to replace a VPWidenSC");
9572 FirstOpId = 0;
9574 unsigned VecOpId =
9575 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9576 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
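// When the tail is folded by masking, predicate the reduction with the block
// mask so that lanes disabled by the mask do not contribute to the result.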
9578 auto *CondOp = CM.foldTailByMasking()
9579 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9580 : nullptr;
9581 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9582 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9583 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9584 Plan->removeVPValueFor(R);
9585 Plan->addVPValue(R, RedRecipe);
9586 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9587 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9588 WidenRecipe->eraseFromParent();
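// For min/max reductions the compare that fed the replaced select is now
// dead; the reduction recipe performs the comparison itself.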
9590 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9591 VPRecipeBase *CompareRecipe =
9592 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9593 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9594 "Expected to replace a VPWidenSC");
9595 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9596 "Expected no remaining users");
9597 CompareRecipe->eraseFromParent();
9599 Chain = R;
9603 // If tail is folded by masking, introduce selects between the phi
9604 // and the live-out instruction of each reduction, at the end of the latch.
9605 if (CM.foldTailByMasking()) {
9606 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9607 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9608 if (!PhiR || PhiR->isInLoop())
9609 continue;
9610 Builder.setInsertPoint(LatchVPBB);
9611 VPValue *Cond =
9612 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9613 VPValue *Red = PhiR->getBackedgeValue();
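// Active lanes take the value coming from the reduction; masked-off lanes
// keep the phi value, i.e. the value from the previous iteration.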
9614 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9619 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9620 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9621 VPSlotTracker &SlotTracker) const {
9622 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9623 IG->getInsertPos()->printAsOperand(O, false);
9624 O << ", ";
9625 getAddr()->printAsOperand(O, SlotTracker);
9626 VPValue *Mask = getMask();
9627 if (Mask) {
9628 O << ", ";
9629 Mask->printAsOperand(O, SlotTracker);
9632 unsigned OpIdx = 0;
9633 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9634 if (!IG->getMember(i))
9635 continue;
9636 if (getNumStoreOperands() > 0) {
9637 O << "\n" << Indent << " store ";
9638 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9639 O << " to index " << i;
9640 } else {
9641 O << "\n" << Indent << " ";
9642 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9643 O << " = load from index " << i;
9645 ++OpIdx;
9648 #endif
9650 void VPWidenCallRecipe::execute(VPTransformState &State) {
9651 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9652 *this, State);
9655 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9656 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9657 this, *this, InvariantCond, State);
9660 void VPWidenRecipe::execute(VPTransformState &State) {
9661 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9664 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9665 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9666 *this, State.UF, State.VF, IsPtrLoopInvariant,
9667 IsIndexLoopInvariant, State);
9670 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9671 assert(!State.Instance && "Int or FP induction being replicated.");
9672 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9673 getTruncInst(), getVPValue(0),
9674 getCastValue(), State);
9677 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9678 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9679 State);
9682 void VPBlendRecipe::execute(VPTransformState &State) {
9683 State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9684 // We know that all PHIs in non-header blocks are converted into
9685 // selects, so we don't have to worry about the insertion order and we
9686 // can just use the builder.
9687 // At this point we generate the predication tree. There may be
9688 // duplications since this is a simple recursive scan, but future
9689 // optimizations will clean it up.
9691 unsigned NumIncoming = getNumIncomingValues();
9693 // Generate a sequence of selects of the form:
9694 // SELECT(Mask3, In3,
9695 // SELECT(Mask2, In2,
9696 // SELECT(Mask1, In1,
9697 // In0)))
9698 // Note that Mask0 is never used: lanes for which no path reaches this phi
9699 // (and which are therefore essentially undef) take their value from In0.
9700 InnerLoopVectorizer::VectorParts Entry(State.UF);
9701 for (unsigned In = 0; In < NumIncoming; ++In) {
9702 for (unsigned Part = 0; Part < State.UF; ++Part) {
9703 // We might have single edge PHIs (blocks) - use an identity
9704 // 'select' for the first PHI operand.
9705 Value *In0 = State.get(getIncomingValue(In), Part);
9706 if (In == 0)
9707 Entry[Part] = In0; // Initialize with the first incoming value.
9708 else {
9709 // Select between the current value and the previous incoming edge
9710 // based on the incoming mask.
9711 Value *Cond = State.get(getMask(In), Part);
9712 Entry[Part] =
9713 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9717 for (unsigned Part = 0; Part < State.UF; ++Part)
9718 State.set(this, Entry[Part], Part);
9721 void VPInterleaveRecipe::execute(VPTransformState &State) {
9722 assert(!State.Instance && "Interleave group being replicated.");
9723 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9724 getStoredValues(), getMask());
9727 void VPReductionRecipe::execute(VPTransformState &State) {
9728 assert(!State.Instance && "Reduction being replicated.");
9729 Value *PrevInChain = State.get(getChainOp(), 0);
9730 for (unsigned Part = 0; Part < State.UF; ++Part) {
9731 RecurKind Kind = RdxDesc->getRecurrenceKind();
9732 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9733 Value *NewVecOp = State.get(getVecOp(), Part);
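// If the reduction is predicated, substitute the recurrence identity for
// masked-off lanes so they do not affect the reduced value.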
9734 if (VPValue *Cond = getCondOp()) {
9735 Value *NewCond = State.get(Cond, Part);
9736 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9737 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9738 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9739 Constant *IdenVec =
9740 ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9741 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9742 NewVecOp = Select;
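// Ordered (strict in-order FP) reductions fold the vector operand directly
// into the scalar chain value; unordered reductions reduce the vector per
// part and combine the result with the chain below.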
9744 Value *NewRed;
9745 Value *NextInChain;
9746 if (IsOrdered) {
9747 if (State.VF.isVector())
9748 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9749 PrevInChain);
9750 else
9751 NewRed = State.Builder.CreateBinOp(
9752 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
9753 PrevInChain, NewVecOp);
9754 PrevInChain = NewRed;
9755 } else {
9756 PrevInChain = State.get(getChainOp(), Part);
9757 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9759 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9760 NextInChain =
9761 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9762 NewRed, PrevInChain);
9763 } else if (IsOrdered)
9764 NextInChain = NewRed;
9765 else {
9766 NextInChain = State.Builder.CreateBinOp(
9767 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9768 PrevInChain);
9770 State.set(this, NextInChain, Part);
9774 void VPReplicateRecipe::execute(VPTransformState &State) {
9775 if (State.Instance) { // Generate a single instance.
9776 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9777 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9778 *State.Instance, IsPredicated, State);
9779 // Insert the scalar instance, packing it into a vector.
9780 if (AlsoPack && State.VF.isVector()) {
9781 // If we're constructing lane 0, initialize to start from poison.
9782 if (State.Instance->Lane.isFirstLane()) {
9783 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9784 Value *Poison = PoisonValue::get(
9785 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9786 State.set(this, Poison, State.Instance->Part);
9788 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9790 return;
9793 // Generate scalar instances for all VF lanes of all UF parts, unless the
9794 // instruction is uniform, in which case generate only the first lane for each
9795 // of the UF parts.
9796 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9797 assert((!State.VF.isScalable() || IsUniform) &&
9798 "Can't scalarize a scalable vector");
9799 for (unsigned Part = 0; Part < State.UF; ++Part)
9800 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9801 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9802 VPIteration(Part, Lane), IsPredicated,
9803 State);
9806 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9807 assert(State.Instance && "Branch on Mask works only on single instance.");
9809 unsigned Part = State.Instance->Part;
9810 unsigned Lane = State.Instance->Lane.getKnownLane();
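// Compute the branch condition for this lane: extract its bit from the block
// mask if one exists, otherwise the block is executed unconditionally.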
9812 Value *ConditionBit = nullptr;
9813 VPValue *BlockInMask = getMask();
9814 if (BlockInMask) {
9815 ConditionBit = State.get(BlockInMask, Part);
9816 if (ConditionBit->getType()->isVectorTy())
9817 ConditionBit = State.Builder.CreateExtractElement(
9818 ConditionBit, State.Builder.getInt32(Lane));
9819 } else // Block in mask is all-one.
9820 ConditionBit = State.Builder.getTrue();
9822 // Replace the temporary unreachable terminator with a new conditional branch,
9823 // whose two destinations will be set later when they are created.
9824 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9825 assert(isa<UnreachableInst>(CurrentTerminator) &&
9826 "Expected to replace unreachable terminator with conditional branch.");
9827 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9828 CondBr->setSuccessor(0, nullptr);
9829 ReplaceInstWithInst(CurrentTerminator, CondBr);
9832 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9833 assert(State.Instance && "Predicated instruction PHI works per instance.");
9834 Instruction *ScalarPredInst =
9835 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9836 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9837 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9838 assert(PredicatingBB && "Predicated block has no single predecessor.");
9839 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9840 "operand must be VPReplicateRecipe");
9842 // By current pack/unpack logic we need to generate only a single phi node: if
9843 // a vector value for the predicated instruction exists at this point it means
9844 // the instruction has vector users only, and a phi for the vector value is
9845 // needed. In this case the recipe of the predicated instruction is marked to
9846 // also do that packing, thereby "hoisting" the insert-element sequence.
9847 // Otherwise, a phi node for the scalar value is needed.
9848 unsigned Part = State.Instance->Part;
9849 if (State.hasVectorValue(getOperand(0), Part)) {
9850 Value *VectorValue = State.get(getOperand(0), Part);
9851 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9852 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9853 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9854 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9855 if (State.hasVectorValue(this, Part))
9856 State.reset(this, VPhi, Part);
9857 else
9858 State.set(this, VPhi, Part);
9859 // NOTE: Currently we need to update the value of the operand, so the next
9860 // predicated iteration inserts its generated value in the correct vector.
9861 State.reset(getOperand(0), VPhi, Part);
9862 } else {
9863 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9864 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9865 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9866 PredicatingBB);
9867 Phi->addIncoming(ScalarPredInst, PredicatedBB);
9868 if (State.hasScalarValue(this, *State.Instance))
9869 State.reset(this, Phi, *State.Instance);
9870 else
9871 State.set(this, Phi, *State.Instance);
9872 // NOTE: Currently we need to update the value of the operand, so the next
9873 // predicated iteration inserts its generated value in the correct vector.
9874 State.reset(getOperand(0), Phi, *State.Instance);
9878 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9879 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9880 State.ILV->vectorizeMemoryInstruction(
9881 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
9882 StoredValue, getMask());
9885 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
9886 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9887 // predication, and 4) a TTI hook that analyzes whether the loop is suitable
9888 // for predication.
9889 static ScalarEpilogueLowering getScalarEpilogueLowering(
9890 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9891 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9892 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9893 LoopVectorizationLegality &LVL) {
9894 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9895 // don't look at hints or options, and don't request a scalar epilogue.
9896 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9897 // LoopAccessInfo (due to code dependency and not being able to reliably get
9898 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9899 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9900 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9901 // back to the old way and vectorize with versioning when forced. See D81345.)
9902 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9903 PGSOQueryType::IRPass) &&
9904 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9905 return CM_ScalarEpilogueNotAllowedOptSize;
9907 // 2) If set, obey the directives
9908 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9909 switch (PreferPredicateOverEpilogue) {
9910 case PreferPredicateTy::ScalarEpilogue:
9911 return CM_ScalarEpilogueAllowed;
9912 case PreferPredicateTy::PredicateElseScalarEpilogue:
9913 return CM_ScalarEpilogueNotNeededUsePredicate;
9914 case PreferPredicateTy::PredicateOrDontVectorize:
9915 return CM_ScalarEpilogueNotAllowedUsePredicate;
9919 // 3) If set, obey the hints
9920 switch (Hints.getPredicate()) {
9921 case LoopVectorizeHints::FK_Enabled:
9922 return CM_ScalarEpilogueNotNeededUsePredicate;
9923 case LoopVectorizeHints::FK_Disabled:
9924 return CM_ScalarEpilogueAllowed;
9927 // 4) If the TTI hook indicates this is profitable, request predication.
9928 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9929 LVL.getLAI()))
9930 return CM_ScalarEpilogueNotNeededUsePredicate;
9932 return CM_ScalarEpilogueAllowed;
9935 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9936 // If Values have been set for this Def, return the one relevant for \p Part.
9937 if (hasVectorValue(Def, Part))
9938 return Data.PerPartOutput[Def][Part];
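// Neither a vector value nor a scalar value for lane 0 exists for Def in
// this part, so Def must be a live-in IR value; broadcast it and cache the
// result.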
9940 if (!hasScalarValue(Def, {Part, 0})) {
9941 Value *IRV = Def->getLiveInIRValue();
9942 Value *B = ILV->getBroadcastInstrs(IRV);
9943 set(Def, B, Part);
9944 return B;
9947 Value *ScalarValue = get(Def, {Part, 0});
9948 // If we aren't vectorizing, we can just copy the scalar map values over
9949 // to the vector map.
9950 if (VF.isScalar()) {
9951 set(Def, ScalarValue, Part);
9952 return ScalarValue;
9955 auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9956 bool IsUniform = RepR && RepR->isUniform();
9958 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9959 // Check if there is a scalar value for the selected lane.
9960 if (!hasScalarValue(Def, {Part, LastLane})) {
9961 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9962 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
9963 "unexpected recipe found to be invariant");
9964 IsUniform = true;
9965 LastLane = 0;
9968 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9969 // Set the insert point after the last scalarized instruction or after the
9970 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9971 // will directly follow the scalar definitions.
9972 auto OldIP = Builder.saveIP();
9973 auto NewIP =
9974 isa<PHINode>(LastInst)
9975 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9976 : std::next(BasicBlock::iterator(LastInst));
9977 Builder.SetInsertPoint(&*NewIP);
9979 // However, if we are vectorizing, we need to construct the vector values.
9980 // If the value is known to be uniform after vectorization, we can just
9981 // broadcast the scalar value corresponding to lane zero for each unroll
9982 // iteration. Otherwise, we construct the vector values using
9983 // insertelement instructions. Since the resulting vectors are stored in
9984 // State, we will only generate the insertelements once.
9985 Value *VectorValue = nullptr;
9986 if (IsUniform) {
9987 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9988 set(Def, VectorValue, Part);
9989 } else {
9990 // Initialize packing with insertelements to start from poison.
9991 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9992 Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9993 set(Def, Poison, Part);
9994 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9995 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9996 VectorValue = get(Def, Part);
9998 Builder.restoreIP(OldIP);
9999 return VectorValue;
10002 // Process the loop in the VPlan-native vectorization path. This path builds
10003 // VPlan upfront in the vectorization pipeline, which allows applying
10004 // VPlan-to-VPlan transformations from the very beginning without modifying the
10005 // input LLVM IR.
10006 static bool processLoopInVPlanNativePath(
10007 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10008 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10009 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10010 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10011 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10012 LoopVectorizationRequirements &Requirements) {
10014 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10015 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10016 return false;
10018 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10019 Function *F = L->getHeader()->getParent();
10020 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10022 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10023 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10025 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10026 &Hints, IAI);
10027 // Use the planner for outer loop vectorization.
10028 // TODO: CM is not used at this point inside the planner. Turn CM into an
10029 // optional argument if we don't need it in the future.
10030 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10031 Requirements, ORE);
10033 // Get user vectorization factor.
10034 ElementCount UserVF = Hints.getWidth();
10036 CM.collectElementTypesForWidening();
10038 // Plan how to best vectorize, return the best VF and its cost.
10039 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10041 // If we are stress testing VPlan builds, do not attempt to generate vector
10042 // code. Masked vector code generation support will follow soon.
10043 // Also, do not attempt to vectorize if no vector code will be produced.
10044 if (VPlanBuildStressTest || EnableVPlanPredication ||
10045 VectorizationFactor::Disabled() == VF)
10046 return false;
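// The VPlan-native path currently vectorizes outer loops with an interleave
// count (UF) of 1.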
10048 LVP.setBestPlan(VF.Width, 1);
10051 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10052 F->getParent()->getDataLayout());
10053 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10054 &CM, BFI, PSI, Checks);
10055 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10056 << L->getHeader()->getParent()->getName() << "\"\n");
10057 LVP.executePlan(LB, DT);
10060 // Mark the loop as already vectorized to avoid vectorizing again.
10061 Hints.setAlreadyVectorized();
10062 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10063 return true;
10066 // Emit a remark if there are stores to floats that required a floating point
10067 // extension. If the vectorized loop was generated with double-precision
10068 // floating point there will be a performance penalty from the conversion
10069 // overhead and the change in the vector width.
10070 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10071 SmallVector<Instruction *, 4> Worklist;
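// Seed the worklist with every store of a 'float' value in the loop; their
// operand chains are walked below looking for floating point conversions.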
10072 for (BasicBlock *BB : L->getBlocks()) {
10073 for (Instruction &Inst : *BB) {
10074 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10075 if (S->getValueOperand()->getType()->isFloatTy())
10076 Worklist.push_back(S);
10081 // Traverse the floating point stores upwards, searching for floating point
10082 // conversions.
10083 SmallPtrSet<const Instruction *, 4> Visited;
10084 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10085 while (!Worklist.empty()) {
10086 auto *I = Worklist.pop_back_val();
10087 if (!L->contains(I))
10088 continue;
10089 if (!Visited.insert(I).second)
10090 continue;
10092 // Emit a remark if the floating point store required a floating
10093 // point conversion.
10094 // TODO: More work could be done to identify the root cause such as a
10095 // constant or a function return type and point the user to it.
10096 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10097 ORE->emit([&]() {
10098 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10099 I->getDebugLoc(), L->getHeader())
10100 << "floating point conversion changes vector width. "
10101 << "Mixed floating point precision requires an up/down "
10102 << "cast that will negatively impact performance.";
10105 for (Use &Op : I->operands())
10106 if (auto *OpI = dyn_cast<Instruction>(Op))
10107 Worklist.push_back(OpI);
10111 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10112 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10113 !EnableLoopInterleaving),
10114 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10115 !EnableLoopVectorization) {}
10117 bool LoopVectorizePass::processLoop(Loop *L) {
10118 assert((EnableVPlanNativePath || L->isInnermost()) &&
10119 "VPlan-native path is not enabled. Only process inner loops.");
10121 #ifndef NDEBUG
10122 const std::string DebugLocStr = getDebugLocString(L);
10123 #endif /* NDEBUG */
10125 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10126 << L->getHeader()->getParent()->getName() << "\" from "
10127 << DebugLocStr << "\n");
10129 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
10131 LLVM_DEBUG(
10132 dbgs() << "LV: Loop hints:"
10133 << " force="
10134 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10135 ? "disabled"
10136 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10137 ? "enabled"
10138 : "?"))
10139 << " width=" << Hints.getWidth()
10140 << " interleave=" << Hints.getInterleave() << "\n");
10142 // Function containing loop
10143 Function *F = L->getHeader()->getParent();
10145 // Looking at the diagnostic output is the only way to determine if a loop
10146 // was vectorized (other than looking at the IR or machine code), so it
10147 // is important to generate an optimization remark for each loop. Most of
10148 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10149 // generated as OptimizationRemark and OptimizationRemarkMissed are
10150 // less verbose, reporting vectorized loops and unvectorized loops that may
10151 // benefit from vectorization, respectively.
10153 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10154 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10155 return false;
10158 PredicatedScalarEvolution PSE(*SE, *L);
10160 // Check if it is legal to vectorize the loop.
10161 LoopVectorizationRequirements Requirements;
10162 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10163 &Requirements, &Hints, DB, AC, BFI, PSI);
10164 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10165 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10166 Hints.emitRemarkWithHints();
10167 return false;
10170 // Check the function attributes and profiles to find out if this function
10171 // should be optimized for size.
10172 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10173 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10175 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10176 // here. They may require CFG and instruction level transformations before
10177 // even evaluating whether vectorization is profitable. Since we cannot modify
10178 // the incoming IR, we need to build VPlan upfront in the vectorization
10179 // pipeline.
10180 if (!L->isInnermost())
10181 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10182 ORE, BFI, PSI, Hints, Requirements);
10184 assert(L->isInnermost() && "Inner loop expected.");
10186 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10187 // count by optimizing for size, to minimize overheads.
10188 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10189 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10190 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10191 << "This loop is worth vectorizing only if no scalar "
10192 << "iteration overheads are incurred.");
10193 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10194 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10195 else {
10196 LLVM_DEBUG(dbgs() << "\n");
10197 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10201 // Check the function attributes to see if implicit floats are allowed.
10202 // FIXME: This check doesn't seem like it can be correct -- what if the loop is
10203 // an integer loop and the vector instructions selected are purely integer
10204 // vector instructions?
10205 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10206 reportVectorizationFailure(
10207 "Can't vectorize when the NoImplicitFloat attribute is used",
10208 "loop not vectorized due to NoImplicitFloat attribute",
10209 "NoImplicitFloat", ORE, L);
10210 Hints.emitRemarkWithHints();
10211 return false;
10214 // Check if the target supports potentially unsafe FP vectorization.
10215 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10216 // for the target we're vectorizing for, to make sure none of the
10217 // additional fp-math flags can help.
10218 if (Hints.isPotentiallyUnsafe() &&
10219 TTI->isFPVectorizationPotentiallyUnsafe()) {
10220 reportVectorizationFailure(
10221 "Potentially unsafe FP op prevents vectorization",
10222 "loop not vectorized due to unsafe FP support.",
10223 "UnsafeFP", ORE, L);
10224 Hints.emitRemarkWithHints();
10225 return false;
10228 if (!LVL.canVectorizeFPMath(ForceOrderedReductions)) {
10229 ORE->emit([&]() {
10230 auto *ExactFPMathInst = Requirements.getExactFPInst();
10231 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10232 ExactFPMathInst->getDebugLoc(),
10233 ExactFPMathInst->getParent())
10234 << "loop not vectorized: cannot prove it is safe to reorder "
10235 "floating-point operations";
10237 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10238 "reorder floating-point operations\n");
10239 Hints.emitRemarkWithHints();
10240 return false;
10243 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10244 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10246 // If an override option has been passed in for interleaved accesses, use it.
10247 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10248 UseInterleaved = EnableInterleavedMemAccesses;
10250 // Analyze interleaved memory accesses.
10251 if (UseInterleaved) {
10252 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10255 // Use the cost model.
10256 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10257 F, &Hints, IAI);
10258 CM.collectValuesToIgnore();
10259 CM.collectElementTypesForWidening();
10261 // Use the planner for vectorization.
10262 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10263 Requirements, ORE);
10265 // Get user vectorization factor and interleave count.
10266 ElementCount UserVF = Hints.getWidth();
10267 unsigned UserIC = Hints.getInterleave();
10269 // Plan how to best vectorize, return the best VF and its cost.
10270 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10272 VectorizationFactor VF = VectorizationFactor::Disabled();
10273 unsigned IC = 1;
10275 if (MaybeVF) {
10276 VF = *MaybeVF;
10277 // Select the interleave count.
10278 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10281 // Identify the diagnostic messages that should be produced.
10282 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10283 bool VectorizeLoop = true, InterleaveLoop = true;
10284 if (VF.Width.isScalar()) {
10285 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10286 VecDiagMsg = std::make_pair(
10287 "VectorizationNotBeneficial",
10288 "the cost-model indicates that vectorization is not beneficial");
10289 VectorizeLoop = false;
10292 if (!MaybeVF && UserIC > 1) {
10293 // Tell the user interleaving was avoided up-front, despite being explicitly
10294 // requested.
10295 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10296 "interleaving should be avoided up front\n");
10297 IntDiagMsg = std::make_pair(
10298 "InterleavingAvoided",
10299 "Ignoring UserIC, because interleaving was avoided up front");
10300 InterleaveLoop = false;
10301 } else if (IC == 1 && UserIC <= 1) {
10302 // Tell the user interleaving is not beneficial.
10303 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10304 IntDiagMsg = std::make_pair(
10305 "InterleavingNotBeneficial",
10306 "the cost-model indicates that interleaving is not beneficial");
10307 InterleaveLoop = false;
10308 if (UserIC == 1) {
10309 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10310 IntDiagMsg.second +=
10311 " and is explicitly disabled or interleave count is set to 1";
10313 } else if (IC > 1 && UserIC == 1) {
10314 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10315 LLVM_DEBUG(
10316 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10317 IntDiagMsg = std::make_pair(
10318 "InterleavingBeneficialButDisabled",
10319 "the cost-model indicates that interleaving is beneficial "
10320 "but is explicitly disabled or interleave count is set to 1");
10321 InterleaveLoop = false;
10325 // Override IC if the user provided an interleave count.
10325 IC = UserIC > 0 ? UserIC : IC;
10327 // Emit diagnostic messages, if any.
10328 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10329 if (!VectorizeLoop && !InterleaveLoop) {
10330 // Do not vectorize or interleave the loop.
10331 ORE->emit([&]() {
10332 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10333 L->getStartLoc(), L->getHeader())
10334 << VecDiagMsg.second;
10336 ORE->emit([&]() {
10337 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10338 L->getStartLoc(), L->getHeader())
10339 << IntDiagMsg.second;
10341 return false;
10342 } else if (!VectorizeLoop && InterleaveLoop) {
10343 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10344 ORE->emit([&]() {
10345 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10346 L->getStartLoc(), L->getHeader())
10347 << VecDiagMsg.second;
10349 } else if (VectorizeLoop && !InterleaveLoop) {
10350 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10351 << ") in " << DebugLocStr << '\n');
10352 ORE->emit([&]() {
10353 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10354 L->getStartLoc(), L->getHeader())
10355 << IntDiagMsg.second;
10357 } else if (VectorizeLoop && InterleaveLoop) {
10358 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10359 << ") in " << DebugLocStr << '\n');
10360 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10363 bool DisableRuntimeUnroll = false;
10364 MDNode *OrigLoopID = L->getLoopID();
10366 // Optimistically generate runtime checks. Drop them if they turn out to not
10367 // be profitable. Limit the scope of Checks, so the cleanup happens
10368 // immediately after vector code generation is done.
10369 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10370 F->getParent()->getDataLayout());
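// Only create the runtime checks when the loop will actually be vectorized
// or interleaved.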
10371 if (!VF.Width.isScalar() || IC > 1)
10372 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10373 LVP.setBestPlan(VF.Width, IC);
10375 using namespace ore;
10376 if (!VectorizeLoop) {
10377 assert(IC > 1 && "interleave count should not be 1 or 0");
10378 // If we decided that it is not legal to vectorize the loop, then
10379 // interleave it.
10380 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10381 &CM, BFI, PSI, Checks);
10382 LVP.executePlan(Unroller, DT);
10384 ORE->emit([&]() {
10385 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10386 L->getHeader())
10387 << "interleaved loop (interleaved count: "
10388 << NV("InterleaveCount", IC) << ")";
10390 } else {
10391 // If we decided that it is *legal* to vectorize the loop, then do it.
10393 // Consider vectorizing the epilogue too if it's profitable.
10394 VectorizationFactor EpilogueVF =
10395 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10396 if (EpilogueVF.Width.isVector()) {
10398 // The first pass vectorizes the main loop and creates a scalar epilogue
10399 // to be vectorized by executing the plan (potentially with a different
10400 // factor) again shortly afterwards.
10401 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
10402 EpilogueVF.Width.getKnownMinValue(),
10403 1);
10404 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10405 EPI, &LVL, &CM, BFI, PSI, Checks);
10407 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
10408 LVP.executePlan(MainILV, DT);
10409 ++LoopsVectorized;
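// Vectorizing the main loop changed the surrounding loop structure;
// re-simplify the remainder loop and restore LCSSA form before vectorizing
// the epilogue below.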
10411 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10412 formLCSSARecursively(*L, *DT, LI, SE);
10414 // Second pass vectorizes the epilogue and adjusts the control flow
10415 // edges from the first pass.
10416 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
10417 EPI.MainLoopVF = EPI.EpilogueVF;
10418 EPI.MainLoopUF = EPI.EpilogueUF;
10419 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10420 ORE, EPI, &LVL, &CM, BFI, PSI,
10421 Checks);
10422 LVP.executePlan(EpilogILV, DT);
10423 ++LoopsEpilogueVectorized;
10425 if (!MainILV.areSafetyChecksAdded())
10426 DisableRuntimeUnroll = true;
10427 } else {
10428 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10429 &LVL, &CM, BFI, PSI, Checks);
10430 LVP.executePlan(LB, DT);
10431 ++LoopsVectorized;
10433 // Add metadata to disable runtime unrolling of the scalar loop when there
10434 // are no runtime checks about strides and memory. A scalar loop that is
10435 // rarely used is not worth unrolling.
10436 if (!LB.areSafetyChecksAdded())
10437 DisableRuntimeUnroll = true;
10439 // Report the vectorization decision.
10440 ORE->emit([&]() {
10441 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10442 L->getHeader())
10443 << "vectorized loop (vectorization width: "
10444 << NV("VectorizationFactor", VF.Width)
10445 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10449 if (ORE->allowExtraAnalysis(LV_NAME))
10450 checkMixedPrecision(L, ORE);
10453 Optional<MDNode *> RemainderLoopID =
10454 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10455 LLVMLoopVectorizeFollowupEpilogue});
10456 if (RemainderLoopID.hasValue()) {
10457 L->setLoopID(RemainderLoopID.getValue());
10458 } else {
10459 if (DisableRuntimeUnroll)
10460 AddRuntimeUnrollDisableMetaData(L);
10462 // Mark the loop as already vectorized to avoid vectorizing again.
10463 Hints.setAlreadyVectorized();
10466 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10467 return true;
10470 LoopVectorizeResult LoopVectorizePass::runImpl(
10471 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10472 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10473 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10474 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10475 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10476 SE = &SE_;
10477 LI = &LI_;
10478 TTI = &TTI_;
10479 DT = &DT_;
10480 BFI = &BFI_;
10481 TLI = TLI_;
10482 AA = &AA_;
10483 AC = &AC_;
10484 GetLAA = &GetLAA_;
10485 DB = &DB_;
10486 ORE = &ORE_;
10487 PSI = PSI_;
10489 // Don't attempt if
10490 // 1. the target claims to have no vector registers, and
10491 // 2. interleaving won't help ILP.
10493 // The second condition is necessary because, even if the target has no
10494 // vector registers, loop vectorization may still enable scalar
10495 // interleaving.
10496 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10497 TTI->getMaxInterleaveFactor(1) < 2)
10498 return LoopVectorizeResult(false, false);
10500 bool Changed = false, CFGChanged = false;
10502 // The vectorizer requires loops to be in simplified form.
10503 // Since simplification may add new inner loops, it has to run before the
10504 // legality and profitability checks. This means running the loop vectorizer
10505 // will simplify all loops, regardless of whether anything ends up being
10506 // vectorized.
10507 for (auto &L : *LI)
10508 Changed |= CFGChanged |=
10509 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10511 // Build up a worklist of inner-loops to vectorize. This is necessary as
10512 // the act of vectorizing or partially unrolling a loop creates new loops
10513 // and can invalidate iterators across the loops.
10514 SmallVector<Loop *, 8> Worklist;
10516 for (Loop *L : *LI)
10517 collectSupportedLoops(*L, LI, ORE, Worklist);
10519 LoopsAnalyzed += Worklist.size();
10521 // Now walk the identified inner loops.
10522 while (!Worklist.empty()) {
10523 Loop *L = Worklist.pop_back_val();
10525 // For the inner loops we actually process, form LCSSA to simplify the
10526 // transform.
10527 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10529 Changed |= CFGChanged |= processLoop(L);
10532 // Process each loop nest in the function.
10533 return LoopVectorizeResult(Changed, CFGChanged);
10536 PreservedAnalyses LoopVectorizePass::run(Function &F,
10537 FunctionAnalysisManager &AM) {
10538 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10539 auto &LI = AM.getResult<LoopAnalysis>(F);
10540 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10541 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10542 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10543 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10544 auto &AA = AM.getResult<AAManager>(F);
10545 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10546 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10547 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10549 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
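// LoopAccessAnalysis is a loop-level analysis; expose it through a callback
// so the vectorizer can query LoopAccessInfo lazily for each loop it visits.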
10550 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10551 [&](Loop &L) -> const LoopAccessInfo & {
10552 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10553 TLI, TTI, nullptr, nullptr};
10554 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10556 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10557 ProfileSummaryInfo *PSI =
10558 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10559 LoopVectorizeResult Result =
10560 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10561 if (!Result.MadeAnyChange)
10562 return PreservedAnalyses::all();
10563 PreservedAnalyses PA;
10565 // We currently do not preserve LoopInfo/DominatorTree analyses with outer loop
10566 // vectorization. Until this is addressed, mark these analyses as preserved
10567 // only for non-VPlan-native path.
10568 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10569 if (!EnableVPlanNativePath) {
10570 PA.preserve<LoopAnalysis>();
10571 PA.preserve<DominatorTreeAnalysis>();
10573 if (!Result.MadeCFGChange)
10574 PA.preserveSet<CFGAnalyses>();
10575 return PA;