//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
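//
// As an illustrative sketch (not drawn from any particular test case), a
// scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten so that each vector iteration handles VF
// consecutive elements, with the leftover iterations handled by a scalar
// epilogue (or by a predicated, tail-folded vector body):
//
//   for (i = 0; i + VF <= n; i += VF)
//     A[i..i+VF-1] = B[i..i+VF-1] + 42;   // wide SIMD operations
//   for (; i < n; ++i)                    // scalar remainder
//     A[i] = B[i] + 42;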
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <functional>
using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

const char VerboseDebug[] = DEBUG_TYPE "-verbose";
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
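// As a hypothetical example (for illustration only, not a documented
// invocation), the tail-folding preference could be selected on an opt
// command line roughly like this:
//
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...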
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
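// Illustrative sketch: in a loop that reads only every other element of an
// interleaved layout, e.g.
//
//   for (i = 0; i < n; ++i)
//     Sum += A[2 * i];   // A[2*i+1] is never read: the group has a gap
//
// the loads form an interleave group with factor 2 that contains a gap, so
// vectorizing the group as one wide load requires masking away the gap lanes
// (or falling back to scalarization/gather).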
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));
// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));
/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));
/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));
static cl::opt<bool> PreferInLoopReductions(
    "prefer-inloop-reductions", cl::init(false), cl::Hidden,
    cl::desc("Prefer in-loop vector reductions, "
             "overriding the target's preference."));
cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));
cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
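// For example (assuming a typical x86-64 data layout), x86_fp80 has a type
// size of 80 bits but an alloc size of 128 bits, so an array of x86_fp80 is
// not bitcast-compatible with <N x x86_fp80> and the type is irregular,
// whereas i32 (32-bit size and alloc size) is regular.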
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
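// Sketch of the intended use (illustrative only): if the instructions in a
// predicated block have cost C, the cost model accounts for roughly
// C / getReciprocalPredBlockProb() = C / 2 per iteration of the loop header,
// since the block is assumed to execute on every other iteration.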
/// A helper function that returns an integer or floating-point constant with
/// the given value \p C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}
/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if it is known.
/// 4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();
  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);
  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;
  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);
  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);
  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);
  /// Set the debug location in the builder \p Ptr using the debug location in
  /// \p V. If \p Ptr is None then it uses the class member's Builder.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);
  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);
  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);
  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);
  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;
  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);
  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);
  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;
  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};
  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// requested.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;
  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}

  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}

  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm
/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}
void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}
void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorisation with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};
/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
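// For illustration, under this ordering all fixed element counts sort before
// all scalable ones, e.g. 4 < 8 < vscale x 4 < vscale x 8, because the
// (isScalable, KnownMinValue) tuples compare as (0,4) < (0,8) < (1,4) < (1,8).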
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);
  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);
  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);
  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -force-ordered-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
    return ForceOrderedReductions && !Hints->allowReordering() &&
           RdxDesc.isOrdered();
  }
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }
  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }
  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }
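
  // For illustration: in a loop such as
  //   for (int64_t I = 0; I < N; ++I)
  //     B[I] = (int32_t)I;
  // the trunc of the 64-bit induction to i32 is optimizable; it is removed by
  // creating an additional i32 induction variable with the truncated start and
  // step, unless the truncate is already free for the target.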
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(ElementCount VF) {
    // Do the analysis once.
    if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(V);
    Align Align = getLoadStoreAlignment(V);
    return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
           (SI && TTI.isLegalMaskedScatter(Ty, Align));
  }
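
  // For illustration: an indexed access such as
  //   for (int I = 0; I < N; ++I)
  //     Sum += A[Idx[I]];
  // has no consecutive pointer for A, so it can only be widened if the target
  // reports a legal masked gather for A's element type at the access
  // alignment; otherwise it is scalarized.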
  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check whether \p I will be
  /// scalarized with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;
  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));
  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(ElementCount VF) const {
    if (!isScalarEpilogueAllowed())
      return false;
    // If we might exit from anywhere but the latch, must run the exiting
    // iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
      return true;
    return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
  }
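
  // For illustration: an interleaved group with a gap, e.g.
  //   for (int I = 0; I < N; I += 2)
  //     X += A[I];        // A[I+1] is never accessed (gap)
  // may read past the last element of A when widened to a full-width load, so
  // the final iterations are peeled into a scalar epilogue unless the gap can
  // be masked away instead.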
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disabled due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }
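
  // For illustration: with tail folding and VF = 4, a loop with trip count 10
  // runs 3 masked vector iterations; in the last one the two lanes past the
  // trip count are disabled by the block mask, so no scalar remainder loop is
  // emitted and every block is treated as needing predication.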
  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;
  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);
  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);
  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A map of in-loop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);
  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;
  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF.isScalar() || !I || !TheLoop->contains(I) ||
        TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  }

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   ElementCount VF) const {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }
  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
};

} // end namespace llvm
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
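///
/// Rough usage sketch (illustrative only, with assumed caller variables):
///   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, DL);
///   Checks.Create(L, LAI, PSE.getUnionPredicate());
///   // ...cost the checks; if we vectorize, emitSCEVChecks() and
///   // emitMemRuntimeChecks() splice the blocks into the new control flow,
///   // otherwise ~GeneratedRTChecks() erases the temporary blocks again.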
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Instruction *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

public:
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      std::tie(std::ignore, MemRuntimeCheckCond) =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }
  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.eraseValueFromMap(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }
  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }
  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
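//
// For illustration, an outer loop that qualifies for this path would be
// written in source as:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)      // annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];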
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}
namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}
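
// For illustration, with VF = 4 the splat created above expands to roughly:
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer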
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
    VPTransformState &State) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  if (Step->getType()->isFloatingPointTy())
    StepType = IntegerType::get(StepType->getContext(),
                                StepType->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
                                          State, Part);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
    unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
  if (Lane < UINT_MAX)
    State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
  else
    State.set(CastDef, VectorLoopVal, Part);
}
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
                                                TruncInst *Trunc, VPValue *Def,
                                                VPValue *CastDef,
                                                VPTransformState &State) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      Value *EntryPart =
          getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
                        ID.getInductionOpcode());
      State.set(Def, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
                                            State, Part);
    }
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (VF.isZero() || VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
}
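
// For illustration, widening an i32 induction with start 0 and step 1 at
// VF = 4, UF = 1 produces roughly:
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>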
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  Type *InitVecValSTy = STy;
  if (STy->isFloatingPointTy()) {
    InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  Value *StartIdxSplat = Builder.CreateVectorSplat(
      VLen, ConstantInt::get(InitVecValSTy, StartIdx));
  InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);

  if (STy->isIntegerTy()) {
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}
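
// For illustration, for a fixed VF of 4, StartIdx = 0 and an integer step %s,
// the sequence built above amounts to:
//   %init      = <i32 0, i32 1, i32 2, i32 3>   ; step vector plus StartIdx splat
//   %scaled    = mul <4 x i32> %init, %s.splat
//   %induction = add <4 x i32> %val, %scaled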
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID,
                                           VPValue *Def, VPValue *CastDef,
                                           VPTransformState &State) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  bool IsUniform =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
  unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                     ScalarIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!IsUniform && VF.isScalable()) {
    VecIVTy = VectorType::get(ScalarIVTy, VF);
    UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
    SplatStep = Builder.CreateVectorSplat(VF, Step);
    SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
  }

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *StartIdx0 =
        createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);

    if (!IsUniform && VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (ScalarIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(Def, Add, Part);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
                                            Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (ScalarIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
      State.set(Def, Add, VPIteration(Part, Lane));
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
                                            Part, Lane);
    }
  }
}
void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
                                                    const VPIteration &Instance,
                                                    VPTransformState &State) {
  Value *ScalarInst = State.get(Def, Instance);
  Value *VectorValue = State.get(Def, Instance.Part);
  VectorValue = Builder.CreateInsertElement(
      VectorValue, ScalarInst,
      Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
  State.set(Def, VectorValue, Instance.Part);
}

Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  return Builder.CreateVectorReverse(Vec, "reverse");
}
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
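  // For example (illustrative): with VF = 4 and an interleave factor of 2, a
  // member at index 1 gets Index = 1 + 3 * 2 = 7, so the GEP below steps the
  // pointer back by 7 elements, i.e. to the member-0 element of the last
  // vector lane, which is the lowest address touched by the group.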
  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }
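    // For example (illustrative): with VF = 4 and factor = 3, a block mask
    // <m0,m1,m2,m3> is replicated above to
    // <m0,m0,m0,m1,m1,m1,m2,m2,m2,m3,m3,m3>, so every member of a lane's tuple
    // shares that lane's predicate; when the group has gaps it is additionally
    // ANDed with the gap mask.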
    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart,
            createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
            "interleaved.mask");
        GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
                                : ShuffledMask;
      }
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}
void InnerLoopVectorizer::vectorizeMemoryInstruction(
    Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
    VPValue *StoredValue, VPValue *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert((Decision == LoopVectorizationCostModel::CM_Widen ||
          Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
          Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
         "CM decision is not to widen the memory instruction");

  Type *ScalarDataTy = getLoadStoreType(Instr);

  auto *DataTy = VectorType::get(ScalarDataTy, VF);
  const Align Alignment = getLoadStoreAlignment(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");
  (void)ConsecutiveStride;

  VectorParts BlockInMaskParts(UF);
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
    } else {
      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };
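  // For example (illustrative): with a fixed VF = 4 (so RunTimeVF = 4) and
  // Part = 1, the reverse case above yields NumElt = -4 and LastLane = -3, so
  // the part pointer is Ptr - 7 and the wide access covers Ptr[-7..-4], which
  // is then reversed further below to match the scalar access order.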
  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(Addr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    State.set(Def, NewLI, Part);
  }
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
                                               VPUser &User,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = VPLane::getFirstLane();
    auto *NewOp = State.get(User.getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  State.set(Def, Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> B(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(OldInst, &B);
  auto *Induction = B.CreatePHI(Start->getType(), 2, "index");

  B.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(OldInst, &B);

  // Create i+1 and fill the PHINode.
  //
  // If the tail is not folded, we know that End - Start >= Step (either
  // statically or through the minimum iteration checks). We also know that both
  // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
  // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
  // overflows and we can mark the induction increment as NUW.
  Value *Next = B.CreateAdd(Induction, Step, "index.next",
                            /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = B.CreateICmpEQ(Next, End);
  B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
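  // For example (illustrative): a loop that is known to execute n times has a
  // backedge-taken count of n - 1, so adding one here recovers the full trip
  // count n.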
  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    assert(!VF.isScalable() &&
           "Tail folding not yet supported for scalable vectors");
    TC = Builder.CreateAdd(
        TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
  }
  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF)) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstFVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstFVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
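  // For example (illustrative; assumes 64-bit pointers): a <2 x double> value
  // destined for a <2 x i8*> type is first bitcast to <2 x i64> and then cast
  // element-wise to the pointer type.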
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  auto *VecIntTy = FixedVectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
}
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    Value *Step =
        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(VF))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
                                                      BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType()->getScalarType() == Step->getType() &&
         "Index scalar type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=LoopVectorBody), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
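  // For example (illustrative): an integer induction with start value 5 and
  // step 3 maps a transformed Index to 5 + 3 * Index; a step of -1 is
  // special-cased above as StartValue - Index.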
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop)
    ParentLoop->addChildLoop(Lp);
  else
    LI->addTopLevelLoop(Lp);

  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}
void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
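    // For example (illustrative): for the primary induction starting at 0 the
    // phi ends up roughly as
    //   %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, <bypass> ], ...
    // i.e. the scalar loop resumes at the vector trip count when reached from
    // the vector loop and at the original start value when reached from a
    // bypass block.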
    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop. Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader. Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  LoopVectorizeHints Hints(L, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
  | ->[ ]     <--- new preheader.
  |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
  |   [ ] \
  |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
   \   |
    \  v
     >[ ]     <-- exit block(s).
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround! Compute the trip count of the original loop and cache it
  // before we start modifying the CFG. This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV. The trip count of the original loop
  // simply happens to be prone to hitting this in practice. In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation. See PR49900.
  getOrCreateTripCount(OrigLoop);

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
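  // For example (illustrative): for a canonical IV that starts at 0 and steps
  // by 1, the escape value computed below is simply CountRoundDown - 1.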
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace
/// Perform cse of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
                                              bool &NeedToScalarize) const {
  Function *F = CI->getCalledFunction();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
  if (VF.isScalar())
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);

  InstructionCost Cost =
      ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
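  // For example (illustrative, with made-up numbers): at VF = 4, a scalar call
  // costing 10 plus a scalarization overhead of 12 gives Cost = 4 * 10 + 12 =
  // 52; if a vector variant of the callee costs less than that, it is chosen
  // below and NeedToScalarize is cleared.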
  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

  if (!TLI || CI->isNoBuiltin() || !VecFunc)
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  InstructionCost VectorCallCost =
      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }

  return Cost;
}
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
    return Elt;
  return VectorType::get(Elt, VF);
}
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  SmallVector<Type *> ParamTys;
  std::transform(FTy->param_begin(), FTy->param_end(),
                 std::back_inserter(ParamTys),
                 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  return TTI.getIntrinsicInstrCost(CostAttrs,
                                   TargetTransformInfo::TCK_RecipThroughput);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
  auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
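  // For example (illustrative): if MinBWs records that an i32 add only needs
  // 8 bits, its operands are truncated to <VF x i8>, the add is redone at i8,
  // and the result is zero-extended back to <VF x i32>.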
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = VectorType::get(
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        auto Elements0 =
            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 =
            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements =
            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements =
            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    VPValue *Def = State.Plan->getVPValue(KV.first);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // If we inserted an edge from the middle block to the unique exit block,
  // update uses outside the loop (phis) to account for the newly inserted
  // edge.
  if (!Cost->requiresScalarEpilogue(VF)) {
    // Fix-up external users of the induction variables.
    for (auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                   IVEndValues[Entry.first], LoopMiddleBlock);

    fixLCSSAPHIs(State);
  }

  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly rougher result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}

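// For example (assuming fixed-width vectorization with VF = 4 and UF = 2),
// the profile update above treats the vector loop as if the original loop had
// been unrolled by 8, so its estimated trip count is divided accordingly and
// the remaining iterations are attributed to the scalar remainder loop.
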
void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
  for (VPRecipeBase &R : Header->phis()) {
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
      fixReduction(ReductionPhi, State);
    else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
      fixFirstOrderRecurrence(FOR, State);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
                                                  VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body:
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  if (VF.isVector()) {
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (any_of(LCSSAPhi.incoming_values(),
                 [Phi](Value *V) { return V == Phi; }))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}

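// Sketch of the unroll-only case handled above (VF = 1, UF = 4, illustrative):
// the four unrolled copies of the recurrence update produce s2.0 .. s2.3 in
// order, so the value feeding an LCSSA phi of the recurrence phi itself is the
// second-to-last copy, s2.2, i.e. State.get(PreviousDef, UF - 2), mirroring
// the extract of element VF - 2 in the vectorized case.
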
void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
      if (PreferPredicatedReductionSelect ||
          TTI->preferPredicatedReductionSelect(
              RdxDesc.getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi =
            cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
        VecRdxPhi->setIncomingValueForBlock(
            LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
      }
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
    assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      State.reset(LoopExitInstDef, RdxParts[Part], Part);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
  unsigned Op = RecurrenceDescriptor::getOpcode(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(LoopMiddleBlock->getTerminator());
  if (PhiR->isOrdered())
    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
  else {
    // Floating-point operations should have some FMF to enable the reduction.
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
      } else {
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }
  }

  // Create the reduction after the loop. Note that inloop reductions create
  // the target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !PhiR->isInLoop()) {
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (PhiTy != RdxDesc.getRecurrenceType())
      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks. See the comment on the analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (any_of(LCSSAPhi.incoming_values(),
                 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

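// Shorthand of the combining step above for an integer add reduction with
// UF = 2 and VF = 4 (illustrative only):
//   middle.block:
//     %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
//     %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
// %rdx then feeds bc.merge.rdx in the scalar preheader and, when the middle
// block branches to the exit, the LCSSA phis of the original reduction.
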
void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                                                  VPTransformState &State) {
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RK != RecurKind::Add && RK != RecurKind::Mul)
    return;

  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
  assert(LoopExitInstr && "null loop exit instruction");
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.pop_back_val();
    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = State.get(State.Plan->getVPValue(Cur), Part);
        cast<Instruction>(V)->dropPoisonGeneratingFlags();
      }

    for (User *U : Cur->users()) {
      Instruction *UI = cast<Instruction>(U);
      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
          Visited.insert(UI).second)
        Worklist.push_back(UI);
    }
  }
}

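// For instance, a scalar reduction update such as "add nuw nsw i32 %sum, %x"
// is widened to a vector add whose nuw/nsw flags are dropped above, because
// the partial, reassociated sums may wrap even when the original scalar sum
// could not.
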
void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand updated by the reduction and recurrence
      // code above, leave them alone.
      continue;

    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    // Non-instruction incoming values will have only one value.

    VPLane Lane = VPLane::getFirstLane();
    if (isa<Instruction>(IncomingValue) &&
        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
                                           VF))
      Lane = VPLane::getLastLaneForVF(VF);

    // Can be a loop invariant incoming value or the last scalar value to be
    // extracted from the vectorized loop.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *lastIncomingValue =
        OrigLoop->isLoopInvariant(IncomingValue)
            ? IncomingValue
            : State.get(State.Plan->getVPValue(IncomingValue),
                        VPIteration(UF - 1, Lane));
    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // or may have side effects.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

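// Illustrative example of the sinking above: if a store was scalarized into a
// predicated block pred.store.if and its address computation
//   %gep = getelementptr i32, i32* %p, i64 %idx
// still sits in the vector body but is only used by that store, the
// getelementptr is moved to the start of pred.store.if and its own operands
// are reconsidered on the next pass over the worklist.
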
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    VPWidenPHIRecipe *VPPhi =
        cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
    // Make sure the builder has a valid insert point.
    Builder.SetInsertPoint(NewPhi);
    for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
      VPValue *Inc = VPPhi->getIncomingValue(i);
      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
      NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
    }
  }
}

bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
  return Cost->useOrderedReductions(RdxDesc);
}

void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
                                   VPUser &Operands, unsigned UF,
                                   ElementCount VF, bool IsPtrLoopInvariant,
                                   SmallBitVector &IsIndexLoopInvariant,
                                   VPTransformState &State) {
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
      State.set(VPDef, EntryPart, Part);
      addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(Operands.getOperand(0), VPIteration(0, 0))
                      : State.get(Operands.getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP =
          GEP->isInBounds()
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(VPDef, NewGEP, Part);
      addMetadata(NewGEP, GEP);
    }
  }
}

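// Illustrative sketch for VF = 4: a GEP with a loop-varying index such as
//   %addr = getelementptr inbounds float, float* %A, i64 %i
// is widened above into a single GEP over a vector of indices,
//   %addr.vec = getelementptr inbounds float, float* %A, <4 x i64> %vec.ind
// yielding a <4 x float*> of addresses, whereas a GEP with only loop-invariant
// operands is cloned once and broadcast with a vector splat.
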
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              VPWidenPHIRecipe *PhiR,
                                              VPTransformState &State) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy = (State.VF.isScalar())
                      ? PN->getType()
                      : VectorType::get(PN->getType(), State.VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    State.set(PhiR, VecPhi, 0);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.

  assert(!Legal->isReductionVariable(P) &&
         "reductions should be handled elsewhere");

  setDebugLocFromInst(P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    if (Cost->isScalarAfterVectorization(P, State.VF)) {
      // This is the normalized GEP that starts counting at zero.
      Value *PtrInd =
          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
      // Determine the number of scalars we need to generate for each unroll
      // iteration. If the instruction is uniform, we only need to generate the
      // first lane. Otherwise, we generate all VF values.
      bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
      unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();

      bool NeedsVectorIndex = !IsUniform && VF.isScalable();
      Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
      if (NeedsVectorIndex) {
        Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
        UnitStepVec = Builder.CreateStepVector(VecIVTy);
        PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
      }

      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *PartStart = createStepForVF(
            Builder, ConstantInt::get(PtrInd->getType(), Part), VF);

        if (NeedsVectorIndex) {
          Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
          Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
          Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
          Value *SclrGep =
              emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
          SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, Part);
          // We've cached the whole vector, which means we can support the
          // extraction of any lane.
          continue;
        }

        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
          Value *Idx = Builder.CreateAdd(
              PartStart, ConstantInt::get(PtrInd->getType(), Lane));
          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep =
              emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
          SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, VPIteration(Part, Lane));
        }
      }
      return;
    }
    assert(isa<SCEVConstant>(II.getStep()) &&
           "Induction step not a SCEV constant!");
    Type *PhiType = II.getStep()->getType();

    // Build a pointer phi
    Value *ScalarStartValue = II.getStartValue();
    Type *ScStValueType = ScalarStartValue->getType();
    PHINode *NewPointerPhi =
        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

    // A pointer induction, performed by using a gep
    BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
    Instruction *InductionLoc = LoopLatch->getTerminator();
    const SCEV *ScalarStep = II.getStep();
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Value *ScalarStepValue =
        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
    Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
    Value *NumUnrolledElems =
        Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
    Value *InductionGEP = GetElementPtrInst::Create(
        ScStValueType->getPointerElementType(), NewPointerPhi,
        Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
        InductionLoc);
    NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

    // Create UF many actual address geps that use the pointer
    // phi as base and a vectorized version of the step value
    // (<step*0, ..., step*N>) as offset.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Type *VecPhiType = VectorType::get(PhiType, State.VF);
      Value *StartOffsetScalar =
          Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
      Value *StartOffset =
          Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Create a vector of consecutive numbers from zero to VF.
      StartOffset =
          Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));

      Value *GEP = Builder.CreateGEP(
          ScStValueType->getPointerElementType(), NewPointerPhi,
          Builder.CreateMul(
              StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
              "vector.gep"));
      State.set(PhiR, GEP, Part);
    }
  }
  }
}

/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  return !CInt || CInt->isZero();
}

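// For example, "udiv i32 %x, 7" cannot divide by zero and needs no
// predication, whereas "udiv i32 %x, %y" with a non-constant (or literal zero)
// divisor makes this helper return true.
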
void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
                                           VPUser &User,
                                           VPTransformState &State) {
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : User.operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      State.set(Def, V, Part);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *B = State.get(User.getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(Def, C, Part);
      addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(CI);

    /// Vectorize casts.
    Type *DestTy =
        (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(Def, Cast, Part);
      addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}

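// E.g. (illustrative), with VF = 4 and UF = 2 a scalar "sext i8 %b to i32" is
// widened above into two independent "sext <4 x i8> ... to <4 x i32>"
// instructions, one per unroll part, each recorded in State for its Part.
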
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
                                               VPUser &ArgOperands,
                                               VPTransformState &State) {
  assert(!isa<DbgInfoIntrinsic>(I) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  setDebugLocFromInst(&I);

  Module *M = I.getParent()->getParent()->getParent();
  auto *CI = cast<CallInst>(&I);

  SmallVector<Type *, 4> Tys;
  for (Value *ArgOperand : CI->arg_operands())
    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // The flag shows whether we use Intrinsic or a usual Call for vectorized
  // version of the instruction.
  // Is it beneficial to perform intrinsic call compared to lib call?
  bool NeedToScalarize = false;
  InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
  InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
  bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
  assert((UseVectorIntrinsic || !NeedToScalarize) &&
         "Instruction should be scalarized elsewhere.");
  assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
         "Either the intrinsic cost or vector call cost must be valid");

  for (unsigned Part = 0; Part < UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl = {CI->getType()};
    SmallVector<Value *, 4> Args;
    for (auto &I : enumerate(ArgOperands.operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
        Arg = State.get(I.value(), Part);
      else
        Arg = State.get(I.value(), VPIteration(0, 0));
      if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseVectorIntrinsic) {
      // Use vector version of the intrinsic.
      if (VF.isVector())
        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
      // Use vector version of the function call.
      const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
      assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
             "Can't create vector function.");
#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
  }
}

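// Illustrative example: a call to @llvm.fmuladd.f32 in the loop is widened
// either to the vector intrinsic @llvm.fmuladd.v4f32 (when the intrinsic cost
// is no worse than a vector library call) or, via VFDatabase, to a vector
// library routine advertised for the scalar callee; operand bundles and
// fast-math flags are carried over from the original call.
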
void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
                                                 VPUser &Operands,
                                                 bool InvariantCond,
                                                 VPTransformState &State) {
  setDebugLocFromInst(&I);

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond = InvariantCond
                        ? State.get(Operands.getOperand(0), VPIteration(0, 0))
                        : nullptr;

  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *Cond =
        InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
    Value *Op0 = State.get(Operands.getOperand(1), Part);
    Value *Op1 = State.get(Operands.getOperand(2), Part);
    Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
    State.set(VPDef, Sel, Part);
    addMetadata(Sel, &I);
  }
}

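// E.g. (illustrative): "select i1 %c, i32 %a, i32 %b" with a loop-varying
// condition becomes a per-part "select <4 x i1>, <4 x i32>, <4 x i32>"; with
// an invariant condition, lane 0 of the vectorized condition is reused as a
// scalar i1 for every part.
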
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is being
  // inserted into Worklist. If the use will be a scalar use, and the
  // pointer is only used by memory accesses, we place the pointer in
  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");

      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      ScalarPtrs.insert(Update);
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

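// Illustrative example: if the only user of a getelementptr is a store that
// the cost model decided to scalarize, the GEP lands in ScalarPtrs and hence
// in Scalars[VF]; the same GEP feeding a gather/scatter would stay vector and
// never enter the scalar set.
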
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getLoadStoreType(I);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedGather(Ty, Alignment))
                            : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
                                TTI.isLegalMaskedScatter(Ty, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

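// E.g. an sdiv in a conditionally executed block whose divisor is not a known
// non-zero constant is reported as scalar-with-predication here, as is a load
// or store that requires a mask but has no legal masked or gather/scatter
// form on the target.
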
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of the current loop are out
  // of scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    // A uniform memory op is itself uniform. We exclude uniform stores
    // here as they demand the last lane, not the first one.
    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
      assert(WideningDecision == CM_Scalarize);
      return true;
    }

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform).
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // A uniform memory op is itself uniform. We exclude uniform stores
      // here as they demand the last lane, not the first one.
      if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
        addToWorklistIfAllowed(&I);

      if (isUniformDecision(&I, VF)) {
        assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
        HasUniformUse.insert(Ptr);
      }
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    auto UsersAreMemAccesses =
        llvm::all_of(I->users(), [&](User *U) -> bool {
          return isVectorizedMemAccessUse(cast<Instruction>(U), V);
        });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check for small trip count",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop without such check by compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}
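// Illustrative note (assumed figures, not taken from any particular target):
// with MaxSafeElements == 32 and a reported maximum vscale of 4, the function
// below clamps the scalable bound to ElementCount::getScalable(32 / 4), i.e.
// at most <vscale x 8> lanes.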
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return ElementCount::getScalable(0);

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance.
  Optional<unsigned> MaxVScale = TTI.getMaxVScale();
  if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
    unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
                             .getVScaleRangeArgs()
                             .second;
    if (VScaleMax > 0)
      MaxVScale = VScaleMax;
  }
  MaxScalableVF = ElementCount::getScalable(
      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}
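// Illustrative note (assumed figures): if LAA reports a maximum safe vector
// width of 256 bits and the widest loop type is i32, the function below
// computes MaxSafeElements = PowerOf2Floor(256 / 32) = 8 and derives both the
// fixed and the scalable maximum safe VFs from that bound.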
FixedScalableVFPair
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeFixedVF))
    Result.FixedVF = MaxVF;

  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeScalableVF))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
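// Illustrative note (assumed figures): in the function below, a loop with a
// known trip count of 100 and MaxVFtimesIC == 8 leaves a remainder of 4, so a
// tail remains and tail folding or a scalar epilogue must be considered; a
// trip count of 96 leaves no remainder and the fixed VF is accepted as-is.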
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                           "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration.  This will
  // require a lane mask which varies through the vector loop body.  (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we choose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors, don't use tail folding as this is currently not yet
  // supported. The code is likely to have ended up here if the tripcount is
  // low, in which case it makes sense not to use scalable vectors.
  if (MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}
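// Illustrative note (assumed figures): in the function below, a 512-bit
// vector register and a widest loop type of i64 give
// PowerOf2Floor(512 / 64) == 8 lanes, which is then clamped by the maximum
// safe VF and, when it is small and a power of two, by the constant trip
// count.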
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that neither WidestRegister nor WidestType is necessarily a power of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below. If
    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
    // the TC is less than or equal to the known number of lanes.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    return TripCountEC;
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
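// Illustrative note (assumed costs): comparing A = {VF=4, Cost=8} with
// B = {VF=8, Cost=12}, the cross-multiplied check below evaluates 8 * 8 = 64
// against 12 * 4 = 48; 64 < 48 is false, so A is not deemed more profitable,
// matching B's lower per-lane cost (1.5 vs 2.0).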
bool LoopVectorizationCostModel::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
      MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to an integer number of
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the tripcount
    // as here.
    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // When set to preferred, for now assume vscale may be larger than 1, so
  // that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  if (Hints->isScalableVectorizationPreferred())
    if (A.Width.isScalable() && !B.Width.isScalable())
      return (CostA * B.Width.getKnownMinValue()) <=
             (CostB * A.Width.getKnownMinValue());

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * B.Width.getKnownMinValue()) <
         (CostB * A.Width.getKnownMinValue());
}
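// Note (illustrative): the per-candidate debug output in the function below
// prints the per-lane cost, i.e. Cost / Width; e.g. a vector loop with a
// total cost of 20 at VF=4 is reported as 5 and compared against the scalar
// loop cost via isMoreProfitable.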
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first);
    LLVM_DEBUG(
        dbgs() << "LV: Vector loop of width " << i << " costs: "
               << (Candidate.Cost / Candidate.Width.getKnownMinValue())
               << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
               << ".\n");

    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  // Emit a report of VFs with invalid costs in the loop.
  if (!InvalidCosts.empty()) {
    // Group the remarks per instruction, keeping the instruction order from
    // InvalidCosts.
    std::map<Instruction *, unsigned> Numbering;
    unsigned I = 0;
    for (auto &Pair : InvalidCosts)
      if (!Numbering.count(Pair.first))
        Numbering[Pair.first] = I++;

    // Sort the list, first on instruction(number) then on VF.
    llvm::sort(InvalidCosts,
               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
                 if (Numbering[A.first] != Numbering[B.first])
                   return Numbering[A.first] < Numbering[B.first];
                 ElementCountComparator ECC;
                 return ECC(A.second, B.second);
               });

    // For a list of ordered instruction-vf pairs:
    //   [(load, vf1), (load, vf2), (store, vf1)]
    // Group the instructions together to emit separate remarks for:
    //   load  (vf1, vf2)
    //   store (vf1)
    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
    auto Subset = ArrayRef<InstructionVFPair>();
    do {
      if (Subset.empty())
        Subset = Tail.take_front(1);

      Instruction *I = Subset.front().first;

      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //   remark: invalid costs for 'load' at VF=(vf1, vf2)
      if (Subset == Tail || Tail[Subset.size()].first != I) {
        std::string OutString;
        raw_string_ostream OS(OutString);
        assert(!Subset.empty() && "Unexpected empty range");
        OS << "Instruction with invalid costs prevented vectorization at VF=(";
        for (auto &Pair : Subset)
          OS << (Pair.second == Subset.front().second ? "" : ", ")
             << Pair.second;
        OS << "):";
        if (auto *CI = dyn_cast<CallInst>(I))
          OS << " call to " << CI->getCalledFunction()->getName();
        else
          OS << " " << I->getOpcodeName();
        OS.flush();
        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
        Tail = Tail.drop_front(Subset.size());
        Subset = {};
      } else
        // Grow the subset by one element
        Subset = Tail.take_front(Subset.size() + 1);
    } while (!Tail.empty());
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                     ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
    const Loop &L, ElementCount VF) const {
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
        return Legal->isFirstOrderRecurrence(&Phi) ||
               Legal->isReductionVariable(&Phi);
      }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
    for (User *U : PostInc->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!L.contains(cast<Instruction>(U)))
        return false;
  }

  // Induction variables that are widened require special handling that is
  // currently not supported.
  if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
        return !(this->isScalarAfterVectorization(Entry.first, VF) ||
                 this->isProfitableToScalarize(Entry.first, VF));
      }))
    return false;

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  if (L.getExitingBlock() != L.getLoopLatch())
    return false;

  return true;
}
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (eg. MVE).
  if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
    return false;
  if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
    return true;
  return false;
}
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  if (MainLoopVF.isScalable()) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
                         "yet supported.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    if (LVP.hasPlanWithVFs(
            {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
      return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
    else {
      LLVM_DEBUG(
          dbgs()
              << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
            << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  if (!isEpilogueVectorizationProfitable(MainLoopVF))
    return Result;

  for (auto &NextVF : ProfitableVFs)
    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
        (Result.Width.getFixedValue() == 1 ||
         isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width.getFixedValue() << "\n";);
  return Result;
}
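// Illustrative note (assumed types): a loop whose widened accesses involve
// i8 loads and i32 stores makes the function below return
// {MinWidth, MaxWidth} = {8, 32}.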
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  for (Type *T : ElementTypesInLoop) {
    MinWidth = std::min<unsigned>(
        MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
    MaxWidth = std::max<unsigned>(
        MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
  }
  return {MinWidth, MaxWidth};
}
void LoopVectorizationCostModel::collectElementTypesForWidening() {
  ElementTypesInLoop.clear();

  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(&I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                      RdxDesc.getRecurrenceType(),
                                      TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      ElementTypesInLoop.insert(T);
    }
  }
}
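// Illustrative note (assumed figures) for the register-pressure heuristic in
// the function below: with 32 registers in a class, 2 of them tied up by
// loop-invariant values and 5 live values per iteration, the candidate
// interleave count is PowerOf2Floor((32 - 2) / 5) = 4 before any clamping.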
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first) << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iterations is enabled. However, for larger loops, there is likely to be a
  // similar benefit as for fixed-width vectors. For now, we choose to leave
  // the InterleaveCount as if vscale is '1', although if some information about
  // the vector is known (e.g. min vector size), we can make a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    InstructionCost C = expectedCost(VF).first;
    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();
  }

  assert(LoopCost && "Non-zero loop cost expected");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    }
    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
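// Illustrative note (assumed indices) for the interval scan in the function
// below: if %a is defined at index 3 and last used at index 7, and %b is
// defined at index 5 and last used at index 9, then indices 5 through 7 see
// both intervals open, so the usage estimate for their register class there
// is 2.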
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
      return 0;
    InstructionCost::CostType RegUsage =
        *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
    assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
           "Nonsensical values for register usage.");
    return RegUsage;
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (Ends.find(I) == Ends.end())
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto& pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) &&
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF.isScalar() || VF.isZero() ||
      InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount if scalable, because that would lead to
        // invalid scalarization costs.
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}
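// Illustrative note (assumed costs): in the function below, if the vector
// form of a predicated chain costs 10 and its scalarized form costs 12
// before scaling, dividing by getReciprocalPredBlockProb() (assumed to be 2)
// gives 6, so the discount is 10 - 6 = 4 and scalarization looks profitable.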
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // scalarized.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnesValue(VF.getFixedValue()), true, false);
      ScalarCost +=
          VF.getFixedValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnesValue(VF.getFixedValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return *Discount.getValue();
}
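// Note (illustrative): expectedCost below sums per-instruction costs block by
// block; for the scalar VF, a predicated block costing 10 contributes only
// 10 / getReciprocalPredBlockProb() to the total, since it does not execute
// on every iteration.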
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(
    ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(&I) ||
          (VF.isVector() && VecValuesToIgnore.count(&I)))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (C.first.isValid() &&
          ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = InstructionCost(ForceTargetInstructionCost);

      // Keep a list of instructions with invalid costs.
      if (Invalid && !C.first.isValid())
        Invalid->emplace_back(&I, VF);

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail folded loops.
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}
/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}
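// Example (illustrative): for an access like A[inv0][i][inv1], where only i
// is an induction variable, the GEP above qualifies and its SCEV is handed
// to the target for address-computation costing; any other loop-variant
// index makes the helper return nullptr.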
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS, TTI::TCK_RecipThroughput);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
    Cost += TTI.getScalarizationOverhead(
        Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true);
    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
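// Rough shape of the estimate above (a sketch, not a formula from the
// source): VF * (address computation + scalar memory op) plus insert/extract
// overhead, and for predicated accesses the whole thing is scaled by the
// block probability with extra i1 extracts and a branch charged on top.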
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                    ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const Align Alignment = getLoadStoreAlignment(I);
  InstructionCost Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                      CostKind);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                CostKind, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  return Cost;
}
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                ElementCount VF) {
  assert(Legal->isUniformMemOp(*I));

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                             CostKind) +
         (isLoopInvariantStoreValue
              ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       VF.getKnownMinValue() - 1));
}
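// In other words: a uniform load is priced as one scalar load plus a
// broadcast shuffle, while a uniform store is one scalar store plus, when
// the stored value varies inside the loop, an extract of the last vector
// lane.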
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                 ElementCount VF) {
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  const Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(
             I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
             TargetTransformInfo::TCK_RecipThroughput, I);
}
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None,
                               0);
  }
  return Cost;
}
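// Example (illustrative): a factor-2 group with both members present is
// priced as a single wide access over 2 * VF elements via
// getInterleavedMemoryOpCost(); a store group with a missing member sets
// UseMaskForGaps above so that the gap lanes are not written.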
Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
    auto *MulType = VectorType::get(Op0->getType(), VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
                             TTI::CastContextHint::None, CostKind, Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
                             TTI::CastContextHint::None, CostKind, RedOp);

    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
             !TheLoop->isLoopInvariant(RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
                             TTI::CastContextHint::None, CostKind, RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp &&
             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Op0);
      auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
      // Matched reduce(mul(ext, ext))
      InstructionCost ExtCost =
          TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
                               TTI::CastContextHint::None, CostKind, Op0);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
          CostKind);

      if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
      // Matched reduce(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
}
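// Example of the MLA-style pattern matched above (illustrative IR, types
// chosen arbitrarily):
//   %a = sext i8 %x to i32
//   %b = sext i8 %y to i32
//   %m = mul i32 %a, %b
//   %s = add i32 %m, %sum   ; part of the in-loop reduction chain
// If TTI reports an extended-add reduction cheaper than ext + ext + mul plus
// the plain reduction, that cost is charged to the reduction instruction and
// the remaining pattern members are reported as free (cost 0).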
InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                     ElementCount VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF.isScalar()) {
    Type *ValTy = getLoadStoreType(I);
    const Align Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
                               TTI::TCK_RecipThroughput, I);
  }
  return getWideningCost(I, VF);
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      return VectorizationCostTy(
          (getInstructionCost(I, ElementCount::getFixed(1)).first *
           VF.getKnownMinValue()),
          false);
  }

  Type *VectorTy;
  InstructionCost C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized =
      VF.isVector() && VectorTy->isVectorTy() &&
      TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
  return VectorizationCostTy(C, TypeNotScalarized);
}
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) const {

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
        true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), Tys);
}
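// Summary (derived from the code above): the overhead is the cost of
// inserting scalarized results back into a vector plus the cost of
// extracting any operands that are only available as vectors; targets that
// keep addresses scalar or support efficient element loads/stores skip parts
// of this.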
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost;
        if (isa<StoreInst>(&I) && VF.isScalable() &&
            isLegalGatherOrScatter(&I)) {
          Cost = getGatherScatterCost(&I, VF);
          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
        } else {
          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
                 "Cannot yet scalarize uniform stores");
          Cost = getUniformMemOpCost(&I, VF);
          setWideningDecision(&I, VF, CM_Scalarize, Cost);
        }
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
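// Decision precedence implemented above: uniform accesses and widenable
// consecutive accesses are decided first; otherwise interleaving wins when
// it is no more expensive than gather/scatter and strictly cheaper than
// scalarization, then gather/scatter, with scalarization as the fallback.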
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false,
              true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(
          TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
          None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF.isVector() && isScalarWithPredication(I)) {
      InstructionCost Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF.getKnownMinValue() *
              TTI.getCFInstrCost(Instruction::PHI, CostKind);

      // The cost of the non-predicated instruction.
      Cost += VF.getKnownMinValue() *
              TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
        TargetTransformInfo::OP_None, I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      TTI::OperandValueProperties Op1VP = TTI::OP_None;
      TTI::OperandValueProperties Op2VP = TTI::OP_None;
      TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
      TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
          CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = ToVectorTy(getLoadStoreType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    if (I->getType()->isPointerTy())
      return 0;
    LLVM_FALLTHROUGH;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI)) {
      InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
      return std::min(CallCost, IntrinsicCost);
    }
    return CallCost;
  }
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    LLVM_FALLTHROUGH;
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}
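// Note: the default case above deliberately prices unknown opcodes like a
// vector multiply; this is a coarse stand-in rather than a per-target
// estimate.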
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : Legal->getInductionVars()) {
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
  for (auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Reduction.second;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
    unsigned Opcode = RdxDesc.getOpcode();
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                   TargetTransformInfo::ReductionFlags()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, TheLoop);
    bool InLoop = !ReductionOperations.empty();
    if (InLoop) {
      InLoopReductionChains[Phi] = ReductionOperations;
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0}};
    }
    reportVectorizationInfo("UserVF ignored because of invalid costs.",
                            "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);

  // Check if it is profitable to vectorize with runtime checks.
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}
void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  assert(BestVF.hasValue() && "Vectorization Factor is missing");
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");

  VPTransformState State{
      *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
  for (const auto &Plan : VPlans)
    if (PrintVPlansInDotFormat)
      Plan->printDOT(O);
    else
      Plan->print(O);
}
#endif
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // branch.
  SmallVector<BasicBlock *> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp are often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating-point operations inherit FMF via the builder's flags.
    Value *MulOp = Builder.CreateFMul(C, Step);
    return Builder.CreateBinOp(BinOp, Val, MulOp);
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
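// After this runs, the loop's !llvm.loop metadata contains (roughly, as an
// illustration) a self-referencing node whose operands include
// !{!"llvm.loop.unroll.runtime.disable"} in addition to whatever operands
// were already attached.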
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
           << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
           << ", Main Loop UF:" << EPI.MainLoopUF
           << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
  });
}
BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
  assert(L && "Expected valid Loop.");
  assert(Bypass && "Expected valid bypass basic block.");
  unsigned VFactor =
      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the main vector loop.
  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
      ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
      "min.iters.check");

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   DT, LI, nullptr, "vector.ph");

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass");

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(Bypass, TCCheckBlock);
    if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
      // For loops with multiple exits, there's no edge from the middle block
      // to exit blocks (as the epilogue must run) and thus no need to update
      // the immediate dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

    LoopBypassBlocks.push_back(TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  return TCCheckBlock;
}

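// Sketch of the emitted guard (illustrative only; block names follow the
// setName/SplitBlock calls above): for an i64 trip count %N this is roughly
//   %min.iters.check = icmp ult i64 %N, (VFactor * UFactor)
//   br i1 %min.iters.check, label %Bypass, label %vector.ph
// with the predicate relaxed to ule when the cost model requires a scalar
// epilogue, so that at least one iteration is always left for it.
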
//===--------------------------------------------------------------------===//
// EpilogueVectorizerEpilogueLoop
//===--------------------------------------------------------------------===//

/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock,
                                 EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;

  createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                          getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}

BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  assert(
      (!isa<Instruction>(EPI.TripCount) ||
       DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
      "saved trip count does not dominate insertion point.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of
  // the vector epilogue loop.
  auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
      ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count,
      ConstantInt::get(Count->getType(),
                       EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
      "min.epilog.iters.check");

  ReplaceInstWithInst(
      Insert->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  LoopBypassBlocks.push_back(Insert);
  return Insert;
}

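// Illustrative shape of the check built above, assuming an i64 trip count:
//   %n.vec.remaining = sub i64 %trip.count, %vector.trip.count
//   %min.epilog.iters.check =
//       icmp ult i64 %n.vec.remaining, (EpilogueVF * EpilogueUF)
//   br i1 %min.epilog.iters.check, label %Bypass, label %vec.epilog.ph
// i.e. the vector epilogue loop is skipped when too few iterations remain.
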
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
  LLVM_DEBUG({
    dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
           << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
           << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
  });
}

void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
  });
}

bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (ElementCount TmpVF = Range.Start * 2;
       ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

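// Example of the clamping behaviour (illustrative): for Range = {4, 32} and a
// predicate that holds for VF=4 and VF=8 but not for VF=16, the loop above
// shrinks the range to {4, 16} and returns true, so every VF remaining in the
// range shares the same decision.
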
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
                                           ElementCount MaxVF) {
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}

VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask. Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan->getOrAddVPValue(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}

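// For a conditional branch `br i1 %c, label %Dst, label %Other` the edge mask
// produced here is roughly `select i1 SrcMask, i1 %c, i1 false` (with %c
// negated first when Dst is the false successor). The select form is used
// instead of an `and`, as explained above, so a poisoned %c cannot leak
// through when SrcMask is already false.
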
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Create the block in mask as the first non-phi instruction in the block.
    VPBuilder::InsertPointGuard Guard(Builder);
    auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
    Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      auto IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
      IV = IVRecipe->getVPSingleValue();
    }

    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP
      // instructions happens.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}

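// For the loop header the resulting mask is, conceptually, either
//   ICmpULE(widened-canonical-IV, backedge-taken-count)
// or an ActiveLaneMask VPInstruction when the target prefers
// get.active.lane.mask. Non-header blocks simply OR the masks of their
// incoming edges, with a null mask standing for "all lanes active".
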
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  auto willWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask);
}

VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
                                           ArrayRef<VPValue *> Operands) const {
  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
  if (II.getKind() == InductionDescriptor::IK_IntInduction ||
      II.getKind() == InductionDescriptor::IK_FpInduction) {
    assert(II.getStartValue() ==
           Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
    const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
    return new VPWidenIntOrFpInductionRecipe(
        Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
  }

  return nullptr;
}

VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
    VPlan &Plan) const {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          isOptimizableIVTruncate(I), Range)) {
    InductionDescriptor II =
        Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
    VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             Start, I);
  }
  return nullptr;
}

VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                                ArrayRef<VPValue *> Operands,
                                                VPlanPtr &Plan) {
  // If all incoming values are equal, the incoming VPValue can be used
  // directly instead of creating a new VPBlendRecipe.
  VPValue *FirstIncoming = Operands[0];
  if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
        return FirstIncoming == Inc;
      }))
    return Operands[0];

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  SmallVector<VPValue *, 2> OperandsWithMask;
  unsigned NumIncoming = Phi->getNumIncomingValues();

  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    OperandsWithMask.push_back(Operands[In]);
    if (EdgeMask)
      OperandsWithMask.push_back(EdgeMask);
  }
  return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
}

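// The blend recipe built here pairs each incoming value with the mask of its
// incoming edge, e.g. for a two-way phi the operand list is roughly
// {In0, Mask0, In1, Mask1} (a mask is omitted when its edge is unmasked).
// VPBlendRecipe::execute later lowers this operand list to a chain of
// selects, as described in its own comment further below.
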
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) const {

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
      Range);
  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  auto willWiden = [&](ElementCount VF) -> bool {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize = false;
    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
    return UseVectorIntrinsic || !NeedToScalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
}

bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
  assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
         !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable or it is predicated.
  auto WillScalarize = [this, I](ElementCount VF) -> bool {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
  };
  return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
                                                             Range);
}

VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands) const {
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return nullptr;

  // Success: widen this instruction.
  return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
}

void VPRecipeBuilder::fixHeaderPhis() {
  BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
  for (VPWidenPHIRecipe *R : PhisToFix) {
    auto *PN = cast<PHINode>(R->getUnderlyingValue());
    VPRecipeBase *IncR =
        getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
    R->addOperand(IncR->getVPSingleValue());
  }
}

VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}

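// In short: uniform or unpredicated instructions get a plain VPReplicateRecipe
// appended to the current block, while predicated ones are wrapped in a
// dedicated replicate region (built by createReplicateRegion below) followed
// by a fresh VPBasicBlock that receives any subsequent recipes.
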
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

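// The resulting region is a triangle (names derived from the instruction's
// opcode, e.g. "pred.store"):
//   pred.<op>.entry:    BranchOnMask BlockInMask
//   pred.<op>.if:       the predicated replicate recipe
//   pred.<op>.continue: optional VPPredInstPHIRecipe merging the result
// with the entry branching either into the "if" block or straight to
// "continue".
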
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
      return toVPRecipeResult(Recipe);

    VPWidenPHIRecipe *PhiRecipe = nullptr;
    if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
      VPValue *StartV = Operands[0];
      if (Legal->isReductionVariable(Phi)) {
        RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
        assert(RdxDesc.getRecurrenceStartValue() ==
               Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
        PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                             CM.isInLoopReduction(Phi),
                                             CM.useOrderedReductions(RdxDesc));
      } else {
        PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
      }

      // Record the incoming value from the backedge, so we can add the
      // incoming value from the backedge after all recipes have been created.
      recordRecipeOf(cast<Instruction>(
          Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
      PhisToFix.push_back(PhiRecipe);
    } else {
      // TODO: record start and backedge value for remaining pointer induction
      // phis.
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      PhiRecipe = new VPWidenPHIRecipe(Phi);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}

void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially
  // dead in the vectorized loop. We don't need to vectorize these
  // instructions. For example, original induction update instructions can
  // become dead because we separately emit induction "steps" when generating
  // code for the new loop. Similarly, we create a new latch condition when
  // setting up the structure of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}

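// For example, with MinVF = 4 and MaxVF = 16 the loop above first tries to
// build one VPlan for the half-open range [4, 17); if some widening decision
// differs at VF = 8, getDecisionAndClampRange shrinks the range and a second
// VPlan is built starting at the clamped end, so every VF in [MinVF, MaxVF]
// is covered by exactly one plan.
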
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and
  // add placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  }

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      SmallVector<VPValue *, 4> Operands;
      auto *Phi = dyn_cast<PHINode>(Instr);
      if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
        Operands.push_back(Plan->getOrAddVPValue(
            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
      } else {
        auto OpRange = Plan->mapToVPValues(Instr->operands());
        Operands = {OpRange.begin(), OpRange.end()};
      }
      if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
              Instr, Operands, Range, Plan)) {
        // If Instr can be simplified to an existing VPValue, use it.
        if (RecipeOrValue.is<VPValue *>()) {
          auto *VPV = RecipeOrValue.get<VPValue *>();
          Plan->addVPValue(Instr, VPV);
          // If the re-used value is a recipe, register the recipe for the
          // instruction, in case the recipe for Instr needs to be recorded.
          if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
            RecipeBuilder.setRecipe(Instr, R);
          continue;
        }
        // Otherwise, add the new recipe.
        VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  RecipeBuilder.fixHeaderPhis();

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
    auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
    if (Region && Region->isReplicator()) {
      assert(Region->getNumSuccessors() == 1 &&
             Region->getNumPredecessors() == 1 && "Expected SESE region!");
      assert(R->getParent()->size() == 1 &&
             "A recipe in an original replicator region must be the only "
             "recipe in its block");
      return Region;
    }
    return nullptr;
  };
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);

    auto *TargetRegion = GetReplicateRegion(Target);
    auto *SinkRegion = GetReplicateRegion(Sink);
    if (!SinkRegion) {
      // If the sink source is not a replicate region, sink the recipe
      // directly.
      if (TargetRegion) {
        // The target is in a replication region, make sure to move Sink to
        // the block after it, not into the replication region itself.
        VPBasicBlock *NextBlock =
            cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
        Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
      } else
        Sink->moveAfter(Target);
      continue;
    }

    // The sink source is in a replicate region. Unhook the region from the
    // CFG.
    auto *SinkPred = SinkRegion->getSinglePredecessor();
    auto *SinkSucc = SinkRegion->getSingleSuccessor();
    VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
    VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
    VPBlockUtils::connectBlocks(SinkPred, SinkSucc);

    if (TargetRegion) {
      // The target recipe is also in a replicate region, move the sink region
      // after the target region.
      auto *TargetSucc = TargetRegion->getSingleSuccessor();
      VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
      VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
      VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
    } else {
      // The sink source is in a replicate region; we need to move the whole
      // replicate region, which should only contain a single recipe in the
      // main block.
      auto *SplitBlock =
          Target->getParent()->splitAt(std::next(Target->getIterator()));

      auto *SplitPred = SplitBlock->getSinglePredecessor();

      VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
      VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
      VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
      if (VPBB == SplitPred)
        VPBB = SplitBlock;
    }
  }

  // Introduce a recipe to combine the incoming and previous values of a
  // first-order recurrence.
  for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
    auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
    if (!RecurPhi)
      continue;

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {RecurPhi, RecurPhi->getBackedgeValue()}));

    VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
    if (auto *Region = GetReplicateRegion(PrevRecipe)) {
      VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor());
      RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi());
    } else
      RecurSplice->moveAfter(PrevRecipe);
    RecurPhi->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to RecurPhi again, after replacing
    // all users.
    RecurSplice->setOperand(0, RecurPhi);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with
  // a single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
        auto *StoreR =
            cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
        StoredValues.push_back(StoreR->getStoredValue());
      }

    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                        Recipe->getMask());
    VPIG->insertBefore(Recipe);
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = Plan->getVPValue(Member);
          Plan->removeVPValueFor(Member);
          Plan->addVPValue(Member, VPIG->getVPValue(J));
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // Adjust the recipes for any inloop reductions.
  adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);

  VPlanTransforms::sinkScalarOperands(*Plan);
  VPlanTransforms::mergeReplicateRegions(*Plan);

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan.
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG.
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
                                             Legal->getInductionVars(),
                                             DeadInstructions, *PSE.getSE());
  return Plan;
}

// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
    ElementCount MinVF) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
      continue;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurKind Kind = RdxDesc.getRecurrenceKind();

      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      auto *CondOp = CM.foldTailByMasking()
                         ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
                         : nullptr;
      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
          &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
      WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
      Plan->removeVPValueFor(R);
      Plan->addVPValue(R, RedRecipe);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
      WidenRecipe->eraseFromParent();

      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
        VPRecipeBase *CompareRecipe =
            RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
        assert(isa<VPWidenRecipe>(CompareRecipe) &&
               "Expected to replace a VPWidenSC");
        assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
               "Expected no remaining users");
        CompareRecipe->eraseFromParent();
      }
      Chain = R;
    }
  }

  // If tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking()) {
    for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
      VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
      if (!PhiR || PhiR->isInLoop())
        continue;
      Builder.setInsertPoint(LatchVPBB);
      VPValue *Cond =
          RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
      VPValue *Red = PhiR->getBackedgeValue();
      Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
    }
  }
}

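// When the tail is folded, the select introduced above has the shape
//   select(header-mask, live-out-value, reduction-phi)
// per reduction, so lanes corresponding to iterations beyond the trip count
// keep the value carried by the reduction phi rather than a garbage element.
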
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << "  store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << "  ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
                                  *this, State);
}

void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
                                    this, *this, InvariantCond, State);
}

void VPWidenRecipe::execute(VPTransformState &State) {
  State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
}

void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
                      *this, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant, State);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
                                   getTruncInst(), getVPValue(0),
                                   getCastValue(), State);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
                                 State);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(Phi, &State.Builder);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // and are essentially undef are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    RecurKind Kind = RdxDesc->getRecurrenceKind();
    bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Constant *IdenVec =
          ConstantVector::getSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
            PrevInChain, NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                    *State.Instance, IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

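// Since this runs per instance (a single Part/Lane pair), a vector mask is
// narrowed to one bit before it is used as the branch condition, roughly:
//   %bit = extractelement <VF x i1> %mask, i32 Lane
//   br i1 %bit, ...
// with the branch successors filled in later, as noted above.
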
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

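// VPWidenMemoryInstructionRecipe::execute (below) widens a load or store to
// operate on whole vectors, delegating the details (consecutive, reverse,
// gather/scatter and masked variants) to vectorizeMemoryInstruction. As a
// rough sketch, a consecutive unmasked load of four i32 elements becomes
// something like:
//   %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr, align 4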
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(
      &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
      StoredValue, getMask());
}

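// A vectorized loop normally keeps a scalar epilogue: the vector body covers
// as many iterations as fit whole vectors, and a scalar remainder loop handles
// the tail. The alternative is to fold the tail into the vector body by
// predicating (masking) it, which removes the remainder loop at the cost of
// masked vector operations. The helper below chooses between these strategies.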
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives.
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    }
  }

  // 3) If set, obey the hints.
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  }

  // 4) If the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}

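// Return the vector value of \p Def for unroll part \p Part, materializing it
// on demand if only per-lane scalar values exist: a uniform value is simply
// broadcast, a non-uniform one is assembled lane by lane. As a rough sketch
// (illustrative names), packing four scalar lanes looks like:
//   %v0 = insertelement <4 x i32> poison, i32 %s0, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
//   %v2 = insertelement <4 x i32> %v1, i32 %s2, i32 2
//   %v3 = insertelement <4 x i32> %v2, i32 %s3, i32 3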
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  Builder.restoreIP(OldIP);
  return VectorValue;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

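// The helper below warns about mixed floating point precision in a vectorized
// loop. A typical source is 'float' data promoted to 'double' for the
// arithmetic, e.g. (roughly) C code such as
//   float f = ...;
//   f = f + 1.0;   // 1.0 is a double constant
// which introduces fpext/fptrunc pairs around the operation and halves the
// number of lanes each vector instruction can process.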
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
  SmallVector<Instruction *, 4> Worklist;
  for (BasicBlock *BB : L->getBlocks()) {
    for (Instruction &Inst : *BB) {
      if (auto *S = dyn_cast<StoreInst>(&Inst)) {
        if (S->getValueOperand()->getType()->isFloatTy())
          Worklist.push_back(S);
      }
    }
  }

  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
  SmallPtrSet<const Instruction *, 4> Visited;
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
  while (!Worklist.empty()) {
    auto *I = Worklist.pop_back_val();
    if (!L->contains(I))
      continue;
    if (!Visited.insert(I).second)
      continue;

    // Emit a remark if the floating point store required a floating
    // point conversion.
    // TODO: More work could be done to identify the root cause such as a
    // constant or a function return type and point the user to it.
    if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
                                          I->getDebugLoc(), L->getHeader())
               << "floating point conversion changes vector width. "
               << "Mixed floating point precision requires an up/down "
               << "cast that will negatively impact performance.";
      });

    for (Use &Op : I->operands())
      if (auto *OpI = dyn_cast<Instruction>(Op))
        Worklist.push_back(OpI);
  }
}

LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}

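// Try to vectorize a single loop: check hints and legality, build the cost
// model and planner, pick a vectorization factor and interleave count, emit
// the corresponding remarks, and execute the best plan (possibly interleaving
// only, or additionally vectorizing the epilogue). Returns true if the loop
// was transformed.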
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  if (!LVL.canVectorizeFPMath(ForceOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector codegeneration is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
    LVP.setBestPlan(VF.Width, IC);

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);
      LVP.executePlan(Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                          EpilogueVF.Width.getKnownMinValue(),
                                          1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
        LVP.executePlan(MainILV, DT);
        ++LoopsVectorized;

        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);
        LVP.executePlan(EpilogILV, DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);
        LVP.executePlan(LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }

      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

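// Top-level driver for the pass: bail out early if the target has neither
// vector registers nor any interleaving benefit, simplify all loops, collect
// the supported inner loops into a worklist, and process them one by one.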
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

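// New pass manager entry point: gather the required analyses, run the
// implementation, and report which analyses remain preserved.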
PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,     SE,
                                      TLI, TTI, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();