//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
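//
// For example, with a vector width of 4, a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that each 'wide' iteration loads four elements
// of 'b', adds 1 to all of them at once, stores four elements of 'a', and
// advances 'i' by 4.
//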
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <functional>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired, predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}
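// For example, ToVectorTy(i32, 4) returns the vector type <4 x i32>, while
// ToVectorTy(i32, 1) simply returns i32.
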
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
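// For illustration: in a typical data layout an i1 has an allocation size of
// one byte, so for VF = 4 the check above compares 4 bytes against the
// one-byte store size of <4 x i1>, classifying i1 as an irregular type here.
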
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
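// For example, under the 50% assumption above a predicated block is expected
// to execute on roughly half of the loop iterations, so its contribution to
// the loop cost is divided by this value.
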
/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}
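// For example, getSignedIntOrFpConstant(i32, -1) yields the i32 constant -1,
// while passing a floating-point type instead yields the FP constant -1.0.
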
namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;
  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;
  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);
  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);
  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);
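  // For example, getStepVector(<0, 0, 0, 0>, /*StartIdx=*/0, /*Step=*/1)
  // produces the step vector <0, 1, 2, 3> (an illustrative integer case;
  // Opcode only matters for FP inductions).
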
  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;
  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);
  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);
  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;
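  // For example, per the formulas above, an integer induction with StartValue
  // 10 and StepValue 2 maps Index 3 to 10 + 3 * 2 = 16, while a pointer
  // induction would instead address element 3 * 2 = 6 of StartValue.
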
  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;
  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;
  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm
/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}
void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}
/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}
namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. The resulting decision map is used for
  /// building the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);
  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }
  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }
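  // For example, for an interleave group of factor 2 whose insert position is
  // its first member, the loop above records (W, Cost) for that member and
  // (W, 0) for the second one.
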
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }
  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }
  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// is used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;
  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              unsigned VF);
1390 /// Collect the instructions that are uniform after vectorization. An
1391 /// instruction is uniform if we represent it with a single scalar value in
1392 /// the vectorized loop corresponding to each vector iteration. Examples of
1393 /// uniform instructions include pointer operands of consecutive or
1394 /// interleaved memory accesses. Note that although uniformity implies an
1395 /// instruction will be scalar, the reverse is not true. In general, a
1396 /// scalarized instruction will be represented by VF scalar values in the
1397 /// vectorized loop, each corresponding to an iteration of the original
1399 void collectLoopUniforms(unsigned VF
);
1401 /// Collect the instructions that are scalar after vectorization. An
1402 /// instruction is scalar if it is known to be uniform or will be scalarized
1403 /// during vectorization. Non-uniform scalarized instructions will be
1404 /// represented by VF values in the vectorized loop, each corresponding to an
1405 /// iteration of the original scalar loop.
1406 void collectLoopScalars(unsigned VF
);
  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                                std::pair<InstWidening, unsigned>>;

  DecisionList WideningDecisions;
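
  // Illustrative entry (hypothetical values): a consecutive load %l considered
  // at VF = 4 might be recorded as
  //   (%l, 4) -> (CM_Widen, cost-of-wide-load)
  // so that later queries for the same (instruction, VF) pair reuse the
  // decision instead of recomputing it.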
  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, unsigned VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  };
  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   unsigned VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

} // end namespace llvm
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.empty() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}
namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID) {
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI);
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
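
// Illustrative result (assuming VF = 4, UF = 2 and an i32 IV starting at 0
// with step 1): the vector IV phi starts at <0, 1, 2, 3>, the first "step.add"
// gives <4, 5, 6, 7> for the second unroll part, and the update moved to the
// latch produces <8, 9, 10, 11>, i.e. the IV advances by VF * UF = 8 per
// vector iteration.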
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  int VLen = Val->getType()->getVectorNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}
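
// Illustrative example (assuming VF = 4, StartIdx = 0 and an integer step S):
// each lane of the result is the corresponding lane of Val plus its index
// scaled by the step, i.e.
//   <Val + 0*S, Val + 1*S, Val + 2*S, Val + 3*S>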
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF > 1 && "VF should be greater than one");

  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                         : VF;
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}
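
// Illustrative example (assuming VF = 4, UF = 2 and a non-uniform EntryVal):
// eight scalar values are recorded, one per (Part, Lane), each computed as
//   ScalarIV + (VF * Part + Lane) * Step
// so lane 2 of unroll part 1 gets ScalarIV + 6 * Step.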
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
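
// For illustration only: when a scalarized, non-uniform value is requested in
// vector form with VF = 4, the packing path above emits a chain of four
// insertelement instructions starting from undef, one per lane, and caches the
// final vector in VectorLoopValueMap so the chain is built only once per Part.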
Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  assert(Instance.Lane > 0
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
  auto *U = getOrCreateVectorValue(V, Instance.Part);
  if (!U->getType()->isVectorTy()) {
    assert(VF == 1 && "Value not scalarized has non-vector type");
    return U;
  }

  // Otherwise, the value from the original loop has been vectorized and is
  // represented by UF vector values. Extract and return the requested scalar
  // value from the appropriate vector lane.
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
}
void InnerLoopVectorizer::packScalarIntoVectorValue(
    Value *V, const VPIteration &Instance) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                            Builder.getInt32(Instance.Lane));
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant *, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}
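
// Illustrative example (assuming VF = 4): the shuffle mask is <3, 2, 1, 0>, so
// an input vector <a, b, c, d> becomes <d, c, b, a>.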
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>    ; Read 4 tuples of R,G,B
//   %R.vec    = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec    = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec    = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (IsMaskForCondRequired || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (IsMaskForCondRequired) {
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              Mask[Part], Undefs, RepMask, "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
                                            Group->getAlignment(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr;
    if (IsMaskForCondRequired) {
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          Mask[Part], Undefs, RepMask, "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    } else
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
                                                 Group->getAlignment());

    Group->addMetadata(NewStoreInstr);
  }
}
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VectorParts *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getLoadStorePointerOperand(Instr);
  unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});

  VectorParts Mask;
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    Mask = *BlockInMask;

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
    InBounds = gep->isInBounds();

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
      PartPtr->setIsInBounds(InBounds);
    }

    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, Ptr);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            Mask[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
*InnerLoopVectorizer::getOrCreateVectorTripCount(Loop
*L
) {
2575 if (VectorTripCount
)
2576 return VectorTripCount
;
2578 Value
*TC
= getOrCreateTripCount(L
);
2579 IRBuilder
<> Builder(L
->getLoopPreheader()->getTerminator());
2581 Type
*Ty
= TC
->getType();
2582 Constant
*Step
= ConstantInt::get(Ty
, VF
* UF
);
2584 // If the tail is to be folded by masking, round the number of iterations N
2585 // up to a multiple of Step instead of rounding down. This is done by first
2586 // adding Step-1 and then rounding down. Note that it's ok if this addition
2587 // overflows: the vector induction variable will eventually wrap to zero given
2588 // that it starts at zero and its Step is a power of two; the loop will then
2589 // exit, with the last early-exit vector comparison also producing all-true.
2590 if (Cost
->foldTailByMasking()) {
2591 assert(isPowerOf2_32(VF
* UF
) &&
2592 "VF*UF must be a power of 2 when folding tail by masking");
2593 TC
= Builder
.CreateAdd(TC
, ConstantInt::get(Ty
, VF
* UF
- 1), "n.rnd.up");
2596 // Now we need to generate the expression for the part of the loop that the
2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598 // iterations are not required for correctness, or N - Step, otherwise. Step
2599 // is equal to the vectorization factor (number of SIMD elements) times the
2600 // unroll factor (number of SIMD instructions).
2601 Value
*R
= Builder
.CreateURem(TC
, Step
, "n.mod.vf");
2603 // If there is a non-reversed interleaved group that may speculatively access
2604 // memory out-of-bounds, we need to ensure that there will be at least one
2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606 // the trip count, we set the remainder to be equal to the step. If the step
2607 // does not evenly divide the trip count, no adjustment is necessary since
2608 // there will already be scalar iterations. Note that the minimum iterations
2609 // check ensures that N >= Step.
2610 if (VF
> 1 && Cost
->requiresScalarEpilogue()) {
2611 auto *IsZero
= Builder
.CreateICmpEQ(R
, ConstantInt::get(R
->getType(), 0));
2612 R
= Builder
.CreateSelect(IsZero
, Step
, R
);
2615 VectorTripCount
= Builder
.CreateSub(TC
, R
, "n.vec");
2617 return VectorTripCount
;
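
// Worked example (illustrative numbers): with VF * UF = 8 and a trip count
// N = 10, the default path computes n.mod.vf = 2 and n.vec = 8, leaving two
// scalar remainder iterations. When folding the tail by masking, N is first
// rounded up to 10 + 7 = 17, so n.mod.vf = 1 and n.vec = 16, and the masked
// vector loop covers all 10 original iterations.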
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
*L
,
2651 BasicBlock
*Bypass
) {
2652 Value
*Count
= getOrCreateTripCount(L
);
2653 BasicBlock
*BB
= L
->getLoopPreheader();
2654 IRBuilder
<> Builder(BB
->getTerminator());
2656 // Generate code to check if the loop's trip count is less than VF * UF, or
2657 // equal to it in case a scalar epilogue is required; this implies that the
2658 // vector trip count is zero. This check also covers the case where adding one
2659 // to the backedge-taken count overflowed leading to an incorrect trip count
2660 // of zero. In this case we will also jump to the scalar loop.
2661 auto P
= Cost
->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662 : ICmpInst::ICMP_ULT
;
2664 // If tail is to be folded, vector loop takes care of all iterations.
2665 Value
*CheckMinIters
= Builder
.getFalse();
2666 if (!Cost
->foldTailByMasking())
2667 CheckMinIters
= Builder
.CreateICmp(
2668 P
, Count
, ConstantInt::get(Count
->getType(), VF
* UF
),
2671 BasicBlock
*NewBB
= BB
->splitBasicBlock(BB
->getTerminator(), "vector.ph");
2672 // Update dominator tree immediately if the generated block is a
2673 // LoopBypassBlock because SCEV expansions to generate loop bypass
2674 // checks may query it before the current function is finished.
2675 DT
->addNewBlock(NewBB
, BB
);
2676 if (L
->getParentLoop())
2677 L
->getParentLoop()->addBasicBlockToLoop(NewBB
, *LI
);
2678 ReplaceInstWithInst(BB
->getTerminator(),
2679 BranchInst::Create(Bypass
, NewBB
, CheckMinIters
));
2680 LoopBypassBlocks
.push_back(BB
);
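
// Illustrative IR shape (assuming VF * UF = 8, an i64 trip count %count, no
// required scalar epilogue and no tail folding), roughly:
//   %min.iters.check = icmp ult i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// where a true result branches to the bypass block and skips the vector loop.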
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!Cost->foldTailByMasking() &&
         "Cannot SCEV check stride or overflow when folding tail");
  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}

void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  if (!MemRuntimeCheck)
    return;

  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                          PSE.getSE());
  LVer->prepareNoAliasMetadata();
}
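
// For illustration only (shorthand; the real checks are emitted by
// LoopAccessInfo::addRuntimeChecks): for two pointers %a and %b the emitted
// bound checks are roughly
//
//   %bound0 = icmp ult i8* %a.start, %b.end
//   %bound1 = icmp ult i8* %b.start, %a.end
//   %conflict = and i1 %bound0, %bound1
//   br i1 %conflict, label %scalar.ph, label %vector.ph
//
// so the vector loop only runs when the accessed ranges are disjoint.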

Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempts to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check: the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
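
// For illustration only: given an integer induction described by
// { Start = 5, Step = 3 }, emitTransformedIndex() emits code computing
// 5 + 3 * Index (folding to 5 - Index when the step is -1), and for a pointer
// induction it emits a GEP that advances StartValue by Index * Step elements.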

BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ]     <-- loop iteration number check.
        |
       [ ]     <-- vector loop bypass (may consist of multiple blocks).
        |
       [ ]     <-- vector pre header.
        |
       [ ] \
       [ ]_|   <-- vector loop.
        |
       [ ]     <-- middle-block.
        |
       [ ]     <-- new preheader.
        |
       [ ] \
       [ ]_|   <-- old scalar loop to handle remainder.
        |
       [ ]     <-- exit block.

   Each of the check blocks can also branch directly to the new (scalar)
   preheader, bypassing the vector loop; those bypass edges are not drawn.
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However, if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal = PHINode::Create(
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
  }

  // We need the OrigLoop (scalar loop part) latch terminator to help
  // produce correct debug info for the middle block BB instructions.
  // The legality check stage guarantees that the loop will have a single
  // latch.
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
         "Scalar loop latch terminator isn't a branch");
  BranchInst *ScalarLatchBr =
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  Value *CmpN = Builder.getTrue();
  if (!Cost->foldTailByMasking()) {
    CmpN =
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
  }

  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    Lp->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

  return LoopVectorPreHeader;
}
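
// For illustration only: with a scalar trip count of 100, VF = 4 and UF = 2,
// getOrCreateVectorTripCount rounds the count down to a multiple of VF * UF,
// i.e. CountRoundDown = 100 - (100 % 8) = 96. The vector body then runs
// 96 / 8 = 12 iterations, the middle block compares 96 against 100, and the
// scalar remainder loop executes the last 4 iterations (unless the tail is
// folded by masking).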

// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.
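  //
  // For illustration only, in shorthand:
  //
  //   i = 0;
  //   do { i.next = i + 1; } while (i.next < n);
  //
  // An external user of i.next (the latch value) must see EndValue, while an
  // external user of the phi i itself must see the penultimate value
  // EndValue - Step, which is recomputed below as Start + Step * (CRD - 1).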

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}

namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform common subexpression elimination (CSE) on induction variable
/// instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}

unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                                       unsigned VF,
                                                       bool &NeedToScalarize) {
  Function *F = CI->getCalledFunction();
  StringRef FnName = CI->getCalledFunction()->getName();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
  if (VF == 1)
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
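
  // For illustration only (the numbers are made up; real values come from
  // TTI): with VF = 4, a scalar call cost of 10 and a scalarization overhead
  // of 6, the scalarized estimate is 10 * 4 + 6 = 46. If a vectorized library
  // version of the callee exists and TTI prices it at, say, 20, the vector
  // call is cheaper and NeedToScalarize is set to false below.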

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    return VectorCallCost;
  }
  return Cost;
}

unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                            unsigned VF) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");

  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<Value *, 4> Operands(CI->arg_operands());
  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}

static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
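  //
  // For illustration only: if the cost model decided that a widened
  // `add <4 x i32>` only needs 8 bits, it is rewritten roughly as
  //
  //   %xt = trunc <4 x i32> %x to <4 x i8>
  //   %yt = trunc <4 x i32> %y to <4 x i8>
  //   %at = add <4 x i8> %xt, %yt
  //   %a  = zext <4 x i8> %at to <4 x i32>
  //
  // and InstCombine later removes the redundant trunc/ext chains.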
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
          !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}

void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Update the dominator tree.
  //
  // FIXME: After creating the structure of the new loop, the dominator tree is
  //        no longer up-to-date, and it remains that way until we update it
  //        here. An out-of-date dominator tree is problematic for SCEV,
  //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
  //        keep the dominator tree up-to-date as we go.
  updateAnalysis();

  // Fix-up external users of the induction variables.
  for (auto &Entry : *Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}

void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}

void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop:
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body:
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
      isa<PHINode>(PreviousLastPart))
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
  else
    Builder.SetInsertPoint(
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask.
  SmallVector<Constant *, 8> ShuffleMask(VF);
  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                             ConstantVector::get(ShuffleMask))
               : Incoming;
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF > 1)
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor;
  // one for multiplication; -1 for And.
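  //
  // For illustration only: for an integer add reduction with start value s and
  // VF = 4, the identity vector is <0, 0, 0, 0> and the start vector is
  // <s, 0, 0, 0>; a multiply reduction would use <s, 1, 1, 1>, and min/max
  // reductions splat the start value itself, since it is its own identity.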
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former.
  if (Cost->foldTailByMasking()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst =
          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"),
          RdxDesc.getFastMathFlags());
    else
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      RdxPart);
  }

  if (VF > 1) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
          RdxDesc.isSigned()
              ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
              : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}

void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getNumIncomingValues() == 1) {
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values will have only one value.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
      // Can be a loop invariant incoming value or the last scalar value to be
      // extracted from the vectorized loop.
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}

void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist that doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}

void InnerLoopVectorizer::fixNonInductionPHIs() {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    PHINode *NewPhi =
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
        predecessors(OrigPhi->getParent()));
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
        predecessors(NewPhi->getParent()));
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
           "Scalar and Vector BB should have the same number of predecessors");

    // The insertion point in Builder may be invalidated by the time we get
    // here. Force the Builder insertion point to something valid so that we do
    // not run into issues during insertion point restore in
    // getOrCreateVectorValue calls below.
    Builder.SetInsertPoint(NewPhi);

    // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
      BasicBlock *NewPredBB = VectorBBPredecessors[i];

      // When looking up the new scalar/vector values to fix up, use incoming
      // values from the original phi.
      Value *ScIncV =
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

      // Scalar incoming value may need a broadcast.
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
      NewPhi->addIncoming(NewIncV, NewPredBB);
    }
  }
}

void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
    // These are the scalar results. Notice that we don't generate vector GEPs
    // because scalar GEPs result in better code.
    for (unsigned Part = 0; Part < UF; ++Part) {
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep =
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
        SclrGep->setName("next.gep");
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
      }
    }
    return;
  }
  }
}

/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  return !CInt || CInt->isZero();
}
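
// For illustration only: in
//
//   for (i = 0; i < n; i++)
//     if (c[i]) x[i] = a[i] / b[i];
//
// the divisor b[i] is not a known non-zero constant, so mayDivideByZero()
// returns true and the conditional division must stay predicated when it is
// scalarized; with a constant divisor such as 4 it could be executed
// unconditionally instead.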

void InnerLoopVectorizer::widenInstruction(Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Br:
  case Instruction::PHI:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::GetElementPtr: {
    // Construct a vector GEP by widening the operands of the scalar GEP as
    // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
    // results in a vector of pointers when at least one operand of the GEP
    // is vector-typed. Thus, to keep the representation compact, we only use
    // vector-typed operands for loop-varying values.
    auto *GEP = cast<GetElementPtrInst>(&I);

    if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
      // If we are vectorizing, but the GEP has only loop-invariant operands,
      // the GEP we build (by only using vector-typed operands for
      // loop-varying values) would be a scalar pointer. Thus, to ensure we
      // produce a vector of pointers, we need to either arbitrarily pick an
      // operand to broadcast, or broadcast a clone of the original GEP.
      // Here, we broadcast a clone of the original.
      //
      // TODO: If at some point we decide to scalarize instructions having
      //       loop-invariant operands, this special case will no longer be
      //       required. We would add the scalarization decision to
      //       collectLoopScalars() and teach getVectorValue() to broadcast
      //       the lane-zero scalar value.
      auto *Clone = Builder.Insert(GEP->clone());
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
        VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
        addMetadata(EntryPart, GEP);
      }
    } else {
      // If the GEP has at least one loop-varying operand, we are sure to
      // produce a vector of pointers. But if we are only unrolling, we want
      // to produce a scalar GEP for each unroll part. Thus, the GEP we
      // produce with the code below will be scalar (if VF == 1) or vector
      // (otherwise). Note that for the unroll-only case, we still maintain
      // values in the vector mapping with initVector, as we do for other
      // instructions.
      for (unsigned Part = 0; Part < UF; ++Part) {
        // The pointer operand of the new GEP. If it's loop-invariant, we
        // won't broadcast it.
        auto *Ptr =
            OrigLoop->isLoopInvariant(GEP->getPointerOperand())
                ? GEP->getPointerOperand()
                : getOrCreateVectorValue(GEP->getPointerOperand(), Part);

        // Collect all the indices for the new GEP. If any index is
        // loop-invariant, we won't broadcast it.
        SmallVector<Value *, 4> Indices;
        for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
          if (OrigLoop->isLoopInvariant(U.get()))
            Indices.push_back(U.get());
          else
            Indices.push_back(getOrCreateVectorValue(U.get(), Part));
        }

        // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
        // but it should be a vector, otherwise.
        auto *NewGEP =
            GEP->isInBounds()
                ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                            Indices)
                : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
        assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
               "NewGEP is not a pointer vector");
        VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
        addMetadata(NewGEP, GEP);
      }
    }

    break;
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(Builder, &I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (Value *Op : I.operands())
        Ops.push_back(getOrCreateVectorValue(Op, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
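
  // For illustration only: with VF = 4 and UF = 2, a scalar `add i32` in the
  // loop body is widened into one instruction per unroll part, roughly
  //
  //   %r.0 = add <4 x i32> %a.0, %b.0
  //   %r.1 = add <4 x i32> %a.1, %b.1
  //
  // with the original instruction's IR flags copied onto each widened part.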
  case Instruction::Select: {
    // Widen selects.
    // If the selector is loop invariant we can create a select
    // instruction with a scalar condition. Otherwise, use vector-select.
    auto *SE = PSE.getSE();
    bool InvariantCond =
        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
    setDebugLocFromInst(Builder, &I);

    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
    // We have to take the 'vectorized' value and pick the first lane.
    // Instcombine will make this a no-op.

    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
      Value *Sel =
          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
      addMetadata(Sel, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = dyn_cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      VectorLoopValueMap.setVectorValue(&I, Part, C);
      addMetadata(C, &I);
    }

    break;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = dyn_cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    /// Vectorize casts.
    Type *DestTy =
        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
      addMetadata(Cast, &I);
    }
    break;
  }
  case Instruction::Call: {
    // Ignore dbg intrinsics.
    if (isa<DbgInfoIntrinsic>(I))
      break;
    setDebugLocFromInst(Builder, &I);

    Module *M = I.getParent()->getParent()->getParent();
    auto *CI = cast<CallInst>(&I);

    StringRef FnName = CI->getCalledFunction()->getName();
    Function *F = CI->getCalledFunction();
    Type *RetTy = ToVectorTy(CI->getType(), VF);
    SmallVector<Type *, 4> Tys;
    for (Value *ArgOperand : CI->arg_operands())
      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize;
    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
    assert((UseVectorIntrinsic || !NeedToScalarize) &&
           "Instruction should be scalarized elsewhere.");

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 4> Args;
      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
        Value *Arg = CI->getArgOperand(i);
        // Some intrinsics have a scalar argument - don't replace it with a
        // vector.
        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
        Args.push_back(Arg);
      }

      Function *VectorF;
      if (UseVectorIntrinsic) {
        // Use vector version of the intrinsic.
        Type *TysForDecl[] = {CI->getType()};
        if (VF > 1)
          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      } else {
        // Use vector version of the library call.
        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
        assert(!VFnName.empty() && "Vector function name is empty.");
        VectorF = M->getFunction(VFnName);
        if (!VectorF) {
          // Generate a declaration
          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
          VectorF =
              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
          VectorF->copyAttributesFrom(F);
        }
      }
      assert(VectorF && "Can't create vector function.");

      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

      if (isa<FPMathOperator>(V))
        V->copyFastMathFlags(CI);

      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}

void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // DT is not kept up-to-date for outer loop vectorization
  if (EnableVPlanNativePath)
    return;

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopMiddleBlock,
                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}

void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Add to the worklist all pointer induction variables and their update
  // instructions.
  //
  // TODO: Once we are able to vectorize pointer induction variables we should
  // no longer insert them into the worklist here.
  auto *Latch = TheLoop->getLoopLatch();
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
      continue;
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // We already considered pointer induction variables, so there's no reason
    // to look at their users again.
    //
    // TODO: Once we are able to vectorize pointer induction variables we
    // should no longer skip over them here.
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF > 1) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    return isa<LoadInst>(I) ?
        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}

bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
                          : TTI.isLegalMaskedStore(Ty);
}

bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}

void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added , its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          })) {
        Worklist.insert(OI);
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
                      << "\n");
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}

bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}

Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may by useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                           "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI))
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

  unsigned MaxVF = computeFeasibleMaxVF(TC);
  if (TC > 0 && TC % MaxVF == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}

unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  unsigned MaxVectorSize = WidestRegister / WidestType;
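  // For example, a 256-bit widest safe register and a widest element type of
  // 32 bits give MaxVectorSize = 256 / 32 = 8 lanes.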

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<unsigned, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
    for (int i = RUs.size() - 1; i >= 0; --i) {
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}

VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
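    // For example, an expected cost of 20 at width 4 competes against the
    // scalar loop as 20 / 4 = 5 per original iteration.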
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
  return Factor;
}

std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}

unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                    << " registers\n");

  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
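  // For example, with 16 target registers, 2 of them tied up by loop-invariant
  // values and at most 3 registers live at once inside the loop, this gives
  // PowerOf2Floor((16 - 2) / 3) = 4 interleaved instances.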
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                              R.MaxLocalUsers);

  // Don't count the induction variable as interleaved.
  if (EnableIndVarRegisterHeur)
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                       std::max(1U, (R.MaxLocalUsers - 1)));

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If the trip count is constant, limit the interleave count to be less than
  // the trip count divided by VF.
  if (TC > 0) {
    assert(TC >= VF && "VF exceeds trip count?");
    if ((TC / VF) < MaxInterleaveCount)
      MaxInterleaveCount = (TC / VF);
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && !Legal->getReductionVars()->empty()) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}

SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };
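  // For example, with a 128-bit widest register, an i32 value at VF == 8 needs
  // max(1, 8 * 32 / 128) = 2 registers, while any scalar value counts as one.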

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (Ends.find(I) == Ends.end())
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      if (VFs[j] == 1) {
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
        continue;
      }
      collectUniformsAndScalars(VFs[j]);
      // Count the number of live intervals.
      unsigned RegUsage = 0;
      for (auto Inst : OpenIntervals) {
        // Skip ignored values for VF > 1.
        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
            isScalarAfterVectorization(Inst, VFs[j]))
          continue;
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      }
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else {
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
                      << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}

void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}

int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF))
          ScalarCost += TTI.getScalarizationOverhead(
              ToVectorTy(J->getType(),VF), false, true);
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
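    // For example, if the widened form costs 10 and the probability-scaled
    // scalar form costs 6, the discount grows by 4 in favor of scalarizing.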
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}

LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
          (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = ForceTargetInstructionCost;

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block. Thus, scale the block's cost by the probability of
    // executing it.
    if (VF == 1 && blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();
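    // E.g., with a reciprocal block probability of 2 (the block is assumed to
    // execute half the time), a block cost of 8 contributes only 4 here.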

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}

/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {

  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}

static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  Cost += VF *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                             unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  unsigned Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  return Cost;
}
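// Illustrative note: for a uniform address, a load is modelled below as one
// scalar load plus a broadcast, and a store as one scalar store plus, unless
// the stored value is loop-invariant, an extract of the last vector lane.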
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                         unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
                                              Instruction::ExtractElement,
                                              VectorTy, VF - 1));
}
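// Illustrative note: gathers/scatters are costed below as a vector address
// computation plus the target's gather/scatter cost for the access, masked if
// the access requires a mask.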
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                          unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                    Legal->isMaskRequired(I), Alignment);
}
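// Illustrative note: an interleave group is costed on the wide vector type
// covering the whole group, e.g. a group of factor 2 at VF=4 is modelled as
// one memory operation on an 8-element vector, plus a reverse shuffle per
// member if the group is reversed.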
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                              unsigned VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF == 1) {
    Type *ValTy = getMemInstValueType(I);
    unsigned Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
  }
  return getWideningCost(I, VF);
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = 1;

  if (VF > 1 && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.find(I) != InstSet.end())
      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
  }

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized =
      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
  return VectorizationCostTy(C, TypeNotScalarized);
}
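// Illustrative note: the scalarization overhead computed below accounts for
// the insertelement instructions needed to rebuild a vector result and the
// extractelement instructions needed to feed the scalar copies of the
// operands, subject to what the target can do efficiently.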
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                              unsigned VF) {
  unsigned Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(RetTy, true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), VF);
}
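// Illustrative note on the decision order below: uniform addresses that need
// no predication are scalarized; widenable consecutive accesses are widened
// (possibly reversed); otherwise the cheapest of interleaving, gather/scatter
// and scalarization is chosen, and all members of an interleave group share
// one decision.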
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
  if (VF == 1)
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
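// Illustrative note: the per-opcode costs computed below are for the whole
// VF-wide operation; opcodes that remain scalar after vectorization are
// charged VF copies of the scalar cost instead.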
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
             PredicatedBBsAfterVectorization.end() ||
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
             PredicatedBBsAfterVectorization.end()))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF > 1 && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

      // The cost of the non-predicated instruction.
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
  }
  case Instruction::FNeg: {
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0));
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    unsigned Width = VF;
    if (Width > 1) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = 1;
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : *Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : *Legal->getInductionVars()) {
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  VPTransformState State{BestVF, BestUF,      LI,
                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,   CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  // We create new control-flow for the vectorized loop, so the original
  // condition will be dead after vectorization if it's only used by the
  // branch.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  for (auto &Induction : *Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
                                 DeadInstructions.end();
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating point operations had to be 'fast' to enable the unrolling.
    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}
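// Illustrative note: in the mask construction below, an edge mask is the
// source block's mask AND'ed with the branch condition (negated for the false
// successor), and a null mask stands for an all-ones mask throughout this
// code.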
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional())
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

  return EdgeMaskCache[Edge] = EdgeMask;
}
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}
VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
                                                           VFRange &Range,
                                                           VPlanPtr &Plan) {
  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
  if (!IG)
    return nullptr;

  // Now check if IG is relevant for VF's in the given range.
  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
    return [=](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(I, VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
  };
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
    return nullptr;

  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
  assert(I == IG->getInsertPos() &&
         "Generating a recipe for an adjunct member of an interleave group");

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPInterleaveRecipe(IG, Mask);
}
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                  VPlanPtr &Plan) {
  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
    return nullptr;

  auto willWiden = [&](unsigned VF) -> bool {
    if (VF == 1)
      return false;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
           "Interleave memory opportunity should be caught earlier.");
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPWidenMemoryInstructionRecipe(*I, Mask);
}
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
    // Check if this is an integer or fp induction. If so, build the recipe that
    // produces its scalar and vector values.
    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
        II.getKind() == InductionDescriptor::IK_FpInduction)
      return new VPWidenIntOrFpInductionRecipe(Phi);

    return nullptr;
  }

  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(unsigned)> {
    return
        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
  };

  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                               isOptimizableIVTruncate(I), Range))
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             cast<TruncInst>(I));
  return nullptr;
}
VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
  PHINode *Phi = dyn_cast<PHINode>(I);
  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
    return nullptr;

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  SmallVector<VPValue *, 2> Masks;
  unsigned NumIncoming = Phi->getNumIncomingValues();
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    if (EdgeMask)
      Masks.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, Masks);
}
bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                 VFRange &Range) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  if (IsPredicated)
    return false;

  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::Br:
    case Instruction::Call:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::GetElementPtr:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::Load:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PHI:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Store:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return false;

  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
      return false;
  }

  auto willWiden = [&](unsigned VF) -> bool {
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                             CM.isProfitableToScalarize(I, VF)))
      return false;
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
      return UseVectorIntrinsic || !NeedToScalarize;
    }
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      assert(CM.getWideningDecision(I, VF) ==
                 LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care by now");
      return false;
    }
    return true;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return false;

  // Success: widen this instruction. We optimize the common case where
  // consecutive instructions can be represented by a single recipe.
  if (!VPBB->empty()) {
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
      return true;
  }

  VPBB->appendRecipe(new VPWidenRecipe(I));
  return true;
}
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
  VPRecipeBase *Recipe = nullptr;
  // Check if Instr should belong to an interleave memory recipe, or already
  // does. In the latter case Instr is irrelevant.
  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr is a memory operation that should be widened.
  if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr should form some PHI recipe.
  if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if ((Recipe = tryToBlend(Instr, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
    VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
    return true;
  }

  // Check if Instr is to be widened by a general VPWidenRecipe, after
  // having first checked for specific widening recipes that deal with
  // Interleave Groups, Inductions and Phi nodes.
  if (tryToWiden(Instr, VPBB, Range))
    return true;

  return false;
}
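// Illustrative note: recipes are attempted above in a fixed order, interleave
// group, widened memory operation, induction, blend, widened phi, then
// general widening; anything left over is replicated per lane by
// handleReplication().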
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable
  // needs to be represented in VPlan for it to model early-exit masking.
  // Also, both the Phi and the live-out instruction of each reduction are
  // required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : *Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
    VF = SubRange.End;
  }
}
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = std::make_unique<VPlan>(VPBB);

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    std::vector<Instruction *> Ingredients;

    // Organize the ingredients to vectorize from current basic block in the
    // right order.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
      // member of the IG, do not construct any Recipe for it.
      const InterleaveGroup<Instruction> *IG =
          CM.getInterleavedAccessGroup(Instr);
      if (IG && Instr != IG->getInsertPos() &&
          Range.Start >= 2 && // Query is illegal for VF == 1
          CM.getWideningDecision(Instr, Range.Start) ==
              LoopVectorizationCostModel::CM_Interleave) {
        auto SinkCandidate = SinkAfterInverse.find(Instr);
        if (SinkCandidate != SinkAfterInverse.end())
          Ingredients.push_back(SinkCandidate->second);
        continue;
      }

      // Move instructions to handle first-order recurrences, step 1: avoid
      // handling this instruction until after we've handled the instruction it
      // should follow.
      auto SAIt = SinkAfter.find(Instr);
      if (SAIt != SinkAfter.end()) {
        LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
                          << *SAIt->second
                          << " to vectorize a 1st order recurrence.\n");
        SinkAfterInverse[SAIt->second] = Instr;
        continue;
      }

      Ingredients.push_back(Instr);

      // Move instructions to handle first-order recurrences, step 2: push the
      // instruction to be sunk at its insertion point.
      auto SAInvIt = SinkAfterInverse.find(Instr);
      if (SAInvIt != SinkAfterInverse.end())
        Ingredients.push_back(SAInvIt->second);
    }

    // Introduce each ingredient into VPlan.
    for (Instruction *Instr : Ingredients) {
      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
        continue;

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : *Legal->getReductionVars()) {
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
      Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  if (User) {
    O << ", ";
    User->getOperand(0)->printAsOperand(O);
  }
  O << "\\l\"";
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
}
void VPWidenRecipe::execute(VPTransformState &State) {
  for (auto &Instr : make_range(Begin, End))
    State.ILV->widenInstruction(Instr);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}
void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = Phi->getNumIncomingValues();

  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with predecessors having a full mask");
  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1, In0)))
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}
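
// Illustrative shape of the chain VPBlendRecipe::execute builds above for a
// phi with three incoming values, for one unrolled part (names are
// illustrative):
//   %s1      = select <VF x i1> %mask1, <VF x ty> %in1, <VF x ty> %in0
//   %predphi = select <VF x i1> %mask2, <VF x ty> %in2, <VF x ty> %s1
// Later edges sit in the outer selects, so with mutually exclusive edge masks
// the blended value matches the original PHI semantics.
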
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}
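
// Example for VPReplicateRecipe::execute above: with UF=2 and VF=4, a
// non-uniform replicated ingredient is scalarized 8 times (parts 0-1 x lanes
// 0-3), while a uniform one is scalarized only twice (lane 0 of each part).
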
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
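
// Example for VPBranchOnMaskRecipe::execute above: for Instance {Part=0,
// Lane=2} with a vector block-in mask, the branch condition becomes
// "extractelement <VF x i1> %mask.part0, i32 2" (names are illustrative).
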
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}
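
// Rough shape of what VPPredInstPHIRecipe::execute above produces (names are
// illustrative): the PHI is created at the current insert point, i.e. in the
// block following PredicatedBB, and merges either
//   [ undef, PredicatingBB ] with [ %scalar.instance, PredicatedBB ]
// in the scalar case, or
//   [ %unmodified.vec, PredicatingBB ] with [ %vec.with.lane, PredicatedBB ]
// when the insert-element packing was hoisted into the predicated block.
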
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  if (!User)
    return State.ILV->vectorizeMemoryInstruction(&Instr);

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
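
// Decide how the scalar epilogue of a loop may be lowered: allowed as usual,
// disallowed because the function is optimized for size (unless vectorization
// was explicitly forced), or made unnecessary by predicating/folding the tail
// when that is preferred globally or requested via loop hints.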
static ScalarEpilogueLowering
getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      (F->hasOptSize() ||
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
    SEL = CM_ScalarEpilogueNotAllowedOptSize;
  else if (PreferPredicateOverEpilog || Hints.getPredicate())
    SEL = CM_ScalarEpilogueNotNeededUsePredicate;

  return SEL;
}
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

  const std::string DebugLocStr = getDebugLocString(L);

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing the loop.
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");
  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  // Prefer constant trip counts over profile data, over upper bound estimate.
  unsigned ExpectedTC = 0;
  bool HasExpectedTC = false;
  if (const SCEVConstant *ConstExits =
          dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
    const APInt &ExitsCount = ConstExits->getAPInt();
    // We are interested in small values for ExpectedTC. Skip over those that
    // can't fit an unsigned.
    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
      HasExpectedTC = true;
    }
  }
  // ExpectedTC may be large because it's bound by a variable. Check
  // profiling information to validate we should vectorize.
  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
    auto EstimatedTC = getLoopEstimatedTripCount(L);
    if (EstimatedTC) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
  }
  if (!HasExpectedTC) {
    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
    HasExpectedTC = (ExpectedTC > 0);
  }

  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }
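
  // For example, a loop whose backedge-taken count is the constant 7 has
  // ExpectedTC = 8; if that is below TinyTripCountVectorThreshold and
  // vectorization was not explicitly forced, a scalar epilogue is disallowed,
  // so the loop can only be vectorized without a scalar remainder (e.g. by
  // folding the tail with masking).
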
  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }
  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }
  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if the user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;
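  // E.g. an interleave count requested via "#pragma clang loop
  // interleave_count(4)" yields UserIC = 4 and overrides the cost-model's
  // choice, while UserIC == 0 (no request) leaves the cost-model's IC intact.
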
  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);
  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}
bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if:
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}
PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}