//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
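//
// For example (an illustrative C-level view, not a transcript of generated
// IR), the scalar loop
//   for (int i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
// becomes, for a vectorization factor of 4, a loop that adds four consecutive
// elements of b and c per iteration using SIMD instructions and steps i by 4,
// with any remaining iterations handled by a scalar epilogue loop.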
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <functional>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired and predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));

cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
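///
/// For example, ToVectorTy(i32, 4) yields &lt;4 x i32&gt;, while a void type or a
/// VF of 1 returns the incoming scalar type unchanged.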
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
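///
/// For example, i1 is irregular under typical data layouts: each i1 element
/// is allocated a full byte, so an array of VF i1 values occupies VF bytes,
/// while a &lt;VF x i1&gt; vector stores its elements as packed bits; the two are
/// therefore not "bitcast compatible".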
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
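///
/// For example, when the cost model accounts for an instruction that sits in
/// a predicated block, it divides that instruction's scalarization cost by
/// this value, so the block is assumed to contribute only half of its cost
/// per iteration of the loop header.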
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
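  ///
  /// Conceptually, packing a scalarized definition into a vector for one
  /// unroll part follows a pattern like the IRBuilder sketch below (an
  /// illustrative outline, not a verbatim excerpt of this file):
  ///
  ///   Value *VectorValue = UndefValue::get(VectorType::get(ScalarTy, VF));
  ///   for (unsigned Lane = 0; Lane < VF; ++Lane)
  ///     VectorValue = Builder.CreateInsertElement(
  ///         VectorValue, getOrCreateScalarValue(V, {Part, Lane}),
  ///         Builder.getInt32(Lane));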

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variable.
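  ///
  /// For example, with VF = 4, StartIdx = 0 and Step = 2, a splat input
  /// Val = &lt;n, n, n, n&gt; becomes &lt;n, n + 2, n + 4, n + 6&gt;, i.e. lane i ends up
  /// holding Val[i] + (StartIdx + i * Step).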
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
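  ///
  /// For example, for an integer induction with StartValue 7 and StepValue 3,
  /// Index = 5 is transformed into 7 + 5 * 3 = 22; for a pointer induction the
  /// result is instead the address of StartValue[5 * 3].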
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
                                                   StringRef RemarkName,
                                                   Loop *TheLoop,
                                                   Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                             ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way, and
  /// the form it takes after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
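  ///
  /// For example, if the loop's primary induction variable %iv is an i64 and
  /// the body contains "trunc i64 %iv to i32", the truncate can be removed by
  /// introducing a new i32 induction variable with the same start and step
  /// (illustrative value names).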
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1275 unsigned NumPredStores
= 0;
1277 /// \return An upper bound for the vectorization factor, larger than zero.
1278 /// One is returned if vectorization should best be avoided due to cost.
1279 unsigned computeFeasibleMaxVF(unsigned ConstTripCount
);
1281 /// The vectorization cost is a combination of the cost itself and a boolean
1282 /// indicating whether any of the contributing operations will actually
1284 /// vector values after type legalization in the backend. If this latter value
1286 /// false, then all operations will be scalarized (i.e. no vectorization has
1287 /// actually taken place).
1288 using VectorizationCostTy
= std::pair
<unsigned, bool>;
1290 /// Returns the expected execution cost. The unit of the cost does
1291 /// not matter because we use the 'cost' units to compare different
1292 /// vector widths. The cost that is returned is *not* normalized by
1293 /// the factor width.
1294 VectorizationCostTy
expectedCost(unsigned VF
);
1296 /// Returns the execution time cost of an instruction for a given vector
1297 /// width. Vector width of one means scalar.
1298 VectorizationCostTy
getInstructionCost(Instruction
*I
, unsigned VF
);
1300 /// The cost-computation logic from getInstructionCost which provides
1301 /// the vector type as an output parameter.
1302 unsigned getInstructionCost(Instruction
*I
, unsigned VF
, Type
*&VectorTy
);
1304 /// Calculate vectorization cost of memory instruction \p I.
1305 unsigned getMemoryInstructionCost(Instruction
*I
, unsigned VF
);
1307 /// The cost computation for scalarized memory instruction.
1308 unsigned getMemInstScalarizationCost(Instruction
*I
, unsigned VF
);
1310 /// The cost computation for interleaving group of memory instructions.
1311 unsigned getInterleaveGroupCost(Instruction
*I
, unsigned VF
);
1313 /// The cost computation for Gather/Scatter instruction.
1314 unsigned getGatherScatterCost(Instruction
*I
, unsigned VF
);
1316 /// The cost computation for widening instruction \p I with consecutive
1318 unsigned getConsecutiveMemOpCost(Instruction
*I
, unsigned VF
);
1320 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1321 /// Load: scalar load + broadcast.
1322 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1324 unsigned getUniformMemOpCost(Instruction
*I
, unsigned VF
);
1326 /// Estimate the overhead of scalarizing an instruction. This is a
1327 /// convenience wrapper for the type-based getScalarizationOverhead API.
1328 unsigned getScalarizationOverhead(Instruction
*I
, unsigned VF
);
1330 /// Returns whether the instruction is a load or store and will be a emitted
1331 /// as a vector operation.
1332 bool isConsecutiveLoadOrStore(Instruction
*I
);
1334 /// Returns true if an artificially high cost for emulated masked memrefs
1336 bool useEmulatedMaskMemRefHack(Instruction
*I
);
1338 /// Map of scalar integer values to the smallest bitwidth they can be legally
1339 /// represented as. The vector equivalents of these values should be truncated
1341 MapVector
<Instruction
*, uint64_t> MinBWs
;
1343 /// A type representing the costs for instructions if they were to be
1344 /// scalarized rather than vectorized. The entries are Instruction-Cost
1346 using ScalarCostsTy
= DenseMap
<Instruction
*, unsigned>;
1348 /// A set containing all BasicBlocks that are known to present after
1349 /// vectorization as a predicated block.
1350 SmallPtrSet
<BasicBlock
*, 4> PredicatedBBsAfterVectorization
;
1352 /// Records whether it is allowed to have the original scalar loop execute at
1353 /// least once. This may be needed as a fallback loop in case runtime
1354 /// aliasing/dependence checks fail, or to handle the tail/remainder
1355 /// iterations when the trip count is unknown or doesn't divide by the VF,
1356 /// or as a peel-loop to handle gaps in interleave-groups.
1357 /// Under optsize and when the trip count is very small we don't allow any
1358 /// iterations to execute in the scalar loop.
1359 ScalarEpilogueLowering ScalarEpilogueStatus
= CM_ScalarEpilogueAllowed
;
1361 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1362 bool FoldTailByMasking
= false;
1364 /// A map holding scalar costs for different vectorization factors. The
1365 /// presence of a cost for an instruction in the mapping indicates that the
1366 /// instruction will be scalarized when vectorizing with the associated
1367 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368 DenseMap
<unsigned, ScalarCostsTy
> InstsToScalarize
;
1370 /// Holds the instructions known to be uniform after vectorization.
1371 /// The data is collected per VF.
1372 DenseMap
<unsigned, SmallPtrSet
<Instruction
*, 4>> Uniforms
;
1374 /// Holds the instructions known to be scalar after vectorization.
1375 /// The data is collected per VF.
1376 DenseMap
<unsigned, SmallPtrSet
<Instruction
*, 4>> Scalars
;
1378 /// Holds the instructions (address computations) that are forced to be
1380 DenseMap
<unsigned, SmallPtrSet
<Instruction
*, 4>> ForcedScalars
;
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              unsigned VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(unsigned VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(unsigned VF);

  /// Keeps the cost model's vectorization decision and cost for each
  /// instruction. Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                                std::pair<InstWidening, unsigned>>;

  DecisionList WideningDecisions;

  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, unsigned VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  }

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   unsigned VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

} // end namespace llvm

// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}

static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.empty() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}

namespace {

/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID) {
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//

Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, the broadcast will be inside
  // the vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}

void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}

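// Illustrative sketch (for intuition only; names below are not emitted
// verbatim): with VF = 4, UF = 1 and an i32 induction with step %step, the
// code above produces roughly
//   %vec.ind      = phi <4 x i32> [ %stepped.start, %vector.ph ],
//                                 [ %vec.ind.next, %latch ]
//   %vec.ind.next = add <4 x i32> %vec.ind, <splat of 4 * %step>
// where %stepped.start is <start, start+step, start+2*step, start+3*step>.
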
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}

bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}

void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}

void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}

Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  int VLen = Val->getType()->getVectorNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant.
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}

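// Illustrative sketch (for intuition only): for an i32 value, VF = 4,
// StartIdx = 0 and step %s, the integer path above builds roughly
//   %0         = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <splat of %s>
//   %induction = add <4 x i32> %val, %0
// The FP path has the same shape, using fmul and the recorded FAdd/FSub
// opcode with fast-math flags instead.
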
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF > 1 && "VF should be greater than one");

  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                         : VF;
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}

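// Illustrative sketch (for intuition only): with VF = 4, UF = 2 and an
// integer IV, the scalar steps recorded above are
//   Part 0: ScalarIV + 0*Step, +1*Step, +2*Step, +3*Step
//   Part 1: ScalarIV + 4*Step, +5*Step, +6*Step, +7*Step
// For a uniform EntryVal only the first lane of each part is materialized.
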
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}

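// Illustrative sketch (for intuition only): packing a scalarized value with
// lanes %s.0 .. %s.3 for VF = 4 produces an insertelement chain of the form
//   %v0 = insertelement <4 x i32> undef, i32 %s.0, i32 0
//   %v1 = insertelement <4 x i32> %v0,   i32 %s.1, i32 1
//   ...
// placed right after the last scalar definition, as arranged above.
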
Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  assert(Instance.Lane > 0
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
  auto *U = getOrCreateVectorValue(V, Instance.Part);
  if (!U->getType()->isVectorTy()) {
    assert(VF == 1 && "Value not scalarized has non-vector type");
    return U;
  }

  // Otherwise, the value from the original loop has been vectorized and is
  // represented by UF vector values. Extract and return the requested scalar
  // value from the appropriate vector lane.
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
}

void InnerLoopVectorizer::packScalarIntoVectorValue(
    Value *V, const VPIteration &Instance) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                            Builder.getInt32(Instance.Lane));
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}

Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant *, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}

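// Illustrative sketch (for intuition only): for VF = 4 the mask built above is
// <3, 2, 1, 0>, i.e. roughly
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>
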
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}

// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g., translate the following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec    = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec    = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec    = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate the following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if the current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Notice that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (IsMaskForCondRequired || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (IsMaskForCondRequired) {
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              Mask[Part], Undefs, RepMask, "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
                                            Group->getAlignment(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has a different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for the current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr;
    if (IsMaskForCondRequired) {
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          Mask[Part], Undefs, RepMask, "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    }
    else
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
                                                 Group->getAlignment());

    Group->addMetadata(NewStoreInstr);
  }
}

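// Illustrative sketch of the masked case (for intuition only): for a factor-2
// group under a block mask <m0, m1, m2, m3> with VF = 4, the replicated mask
// used above is <m0, m0, m1, m1, m2, m2, m3, m3>, optionally ANDed with the
// gap mask before being fed to the masked load/store.
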
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VectorParts *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getLoadStorePointerOperand(Instr);
  unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});

  VectorParts Mask;
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    Mask = *BlockInMask;

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
    InBounds = gep->isInBounds();

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
      PartPtr->setIsInBounds(InBounds);
    }

    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle stores.
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, Ptr);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            Mask[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}

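// Illustrative sketch of the reverse-consecutive case (for intuition only):
// with VF = 4, Part = 0 and the scalar lane-0 pointer %p, CreateVecPtr forms
//   %p0 = getelementptr %p, 0        ; -Part * VF
//   %p1 = getelementptr %p0, -3      ; 1 - VF
// so the wide access covers elements [i-3, i]; the loaded/stored value (and
// any mask) is then flipped with reverseVector, as done above.
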
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}

PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}

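// Illustrative sketch (for intuition only) of the canonical IV built above:
//   vector.body:
//     %index      = phi i64 [ <Start>, %vector.ph ], [ %index.next, ... ]
//     ...
//     %index.next = add i64 %index, <Step>
//     %cmp        = icmp eq i64 %index.next, <End>
//     br i1 %cmp, label %exit, label %vector.body
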
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}

*InnerLoopVectorizer::getOrCreateVectorTripCount(Loop
*L
) {
2575 if (VectorTripCount
)
2576 return VectorTripCount
;
2578 Value
*TC
= getOrCreateTripCount(L
);
2579 IRBuilder
<> Builder(L
->getLoopPreheader()->getTerminator());
2581 Type
*Ty
= TC
->getType();
2582 Constant
*Step
= ConstantInt::get(Ty
, VF
* UF
);
2584 // If the tail is to be folded by masking, round the number of iterations N
2585 // up to a multiple of Step instead of rounding down. This is done by first
2586 // adding Step-1 and then rounding down. Note that it's ok if this addition
2587 // overflows: the vector induction variable will eventually wrap to zero given
2588 // that it starts at zero and its Step is a power of two; the loop will then
2589 // exit, with the last early-exit vector comparison also producing all-true.
2590 if (Cost
->foldTailByMasking()) {
2591 assert(isPowerOf2_32(VF
* UF
) &&
2592 "VF*UF must be a power of 2 when folding tail by masking");
2593 TC
= Builder
.CreateAdd(TC
, ConstantInt::get(Ty
, VF
* UF
- 1), "n.rnd.up");
2596 // Now we need to generate the expression for the part of the loop that the
2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598 // iterations are not required for correctness, or N - Step, otherwise. Step
2599 // is equal to the vectorization factor (number of SIMD elements) times the
2600 // unroll factor (number of SIMD instructions).
2601 Value
*R
= Builder
.CreateURem(TC
, Step
, "n.mod.vf");
2603 // If there is a non-reversed interleaved group that may speculatively access
2604 // memory out-of-bounds, we need to ensure that there will be at least one
2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606 // the trip count, we set the remainder to be equal to the step. If the step
2607 // does not evenly divide the trip count, no adjustment is necessary since
2608 // there will already be scalar iterations. Note that the minimum iterations
2609 // check ensures that N >= Step.
2610 if (VF
> 1 && Cost
->requiresScalarEpilogue()) {
2611 auto *IsZero
= Builder
.CreateICmpEQ(R
, ConstantInt::get(R
->getType(), 0));
2612 R
= Builder
.CreateSelect(IsZero
, Step
, R
);
2615 VectorTripCount
= Builder
.CreateSub(TC
, R
, "n.vec");
2617 return VectorTripCount
;
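// Worked example (for intuition only): with N = 10 and VF * UF = 4,
//   - default:            n.mod.vf = 2, n.vec = 8 (two scalar iterations left)
//   - foldTailByMasking:  n.rnd.up = 13, n.mod.vf = 1, n.vec = 12
//   - requiresScalarEpilogue and N = 8: the zero remainder is bumped to 4,
//     so n.vec = 4 and the epilogue runs at least once.
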
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers or vice-versa. Handle this using a two-step bitcast with an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}

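// Illustrative sketch (for intuition only): on a target with 64-bit pointers,
// casting <2 x double> to <2 x i8*> cannot be done with a single bitcast, so
// the code above goes through the intermediate integer vector type:
//   <2 x double> -> <2 x i64> -> <2 x i8*>
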
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If the tail is to be folded, the vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking())
    CheckMinIters = Builder.CreateICmp(
        P, Count, ConstantInt::get(Count->getType(), VF * UF),
        "min.iters.check");

  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
  LoopBypassBlocks.push_back(BB);
}

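// Illustrative sketch (for intuition only): with VF = 4, UF = 2 and no tail
// folding, the preheader ends up with roughly
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// (ule instead of ult when a scalar epilogue is required).
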
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!BB->getParent()->hasOptSize() &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}

void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  if (!MemRuntimeCheck)
    return;

  assert(!BB->getParent()->hasOptSize() &&
         "Cannot emit memory checks when optimizing for size");

  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                          PSE.getSE());
  LVer->prepareNoAliasMetadata();
}
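
// Sketch of the effect of the memory checks above (a simplified view, not the
// exact IR the analysis emits): for a loop reading from A and writing to B,
// the "vector.memcheck" block compares the accessed address ranges, roughly
// A.end <= B.start || B.end <= A.start. When the emitted MemRuntimeCheck
// value is true (a possible overlap), the branch created above falls back to
// the scalar loop through Bypass; otherwise execution continues in vector.ph.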
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check, the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);
    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
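
// Illustration of the integer-induction transform above, using made-up
// values: for StartValue = %base, Step = 3 and Index = %idx, the emitted code
// is the shorthand
//   %offset = mul i64 %idx, 3
//   %result = add i64 %base, %offset
// so an index of 8 maps to %base + 24. The CreateAdd/CreateMul helpers fold
// the trivial cases (adding 0, multiplying by 1) instead of emitting them.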
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ]     <-- loop iteration number check.
        |
       [ ]     <-- vector loop bypass (may consist of multiple blocks).
        |
       [ ]     <-- vector pre header.
        |
       [ ]     <-- vector loop.
        |
       [ ]     <-- middle-block.
        |
       [ ]     <-- new preheader.
        |
       [ ]     <-- old scalar loop to handle remainder.
        |
       [ ]     <-- exit block.

   The check blocks may branch directly to the scalar preheader, bypassing
   the vector loop entirely.
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal = PHINode::Create(
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
  }

  // We need the OrigLoop (scalar loop part) latch terminator to help
  // produce correct debug info for the middle block BB instructions.
  // The legality check stage guarantees that the loop will have a single
  // latch.
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
         "Scalar loop latch terminator isn't a branch");
  BranchInst *ScalarLatchBr =
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  Value *CmpN = Builder.getTrue();
  if (!Cost->foldTailByMasking()) {
    CmpN =
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
  }

  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    Lp->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

  return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace

/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}
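
// Example of the CSE performed above (hypothetical IR): if unrolling produced
// two identical address computations in the vector body, say
//   %g1 = getelementptr inbounds i32, i32* %p, i64 %offset
//   %g2 = getelementptr inbounds i32, i32* %p, i64 %offset
// the second GEP hashes and compares equal to the first, so all uses of %g2
// are rewritten to %g1 and %g2 is erased.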
unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                                       unsigned VF,
                                                       bool &NeedToScalarize) {
  Function *F = CI->getCalledFunction();
  StringRef FnName = CI->getCalledFunction()->getName();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
  if (VF == 1)
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    return VectorCallCost;
  }
  return Cost;
}
unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                            unsigned VF) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");

  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<Value *, 4> Operands(CI->arg_operands());
  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
          !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}
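
// Example of the shrinking performed above (types are illustrative): if the
// cost model recorded that a chain of <4 x i32> operations only ever needs 8
// significant bits, each operand is truncated to <4 x i8>, the operation is
// recreated at the narrow type, and the result is zero-extended back to
// <4 x i32>. The redundant ext/trunc pairs this introduces are left for
// InstCombine to clean up, as the comment at the top of the function notes.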
void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Update the dominator tree.
  //
  // FIXME: After creating the structure of the new loop, the dominator tree is
  //        no longer up-to-date, and it remains that way until we update it
  //        here. An out-of-date dominator tree is problematic for SCEV,
  //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
  //        keep the dominator tree up-to-date as we go.
  updateAnalysis();

  // Fix-up external users of the induction variables.
  for (auto &Entry : *Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}
void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // loop:
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //
  //   vector.body:
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
      isa<PHINode>(PreviousLastPart))
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
  else
    Builder.SetInsertPoint(
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask.
  SmallVector<Constant *, 8> ShuffleMask(VF);
  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                             ConstantVector::get(ShuffleMask))
               : Incoming;
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF > 1)
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}
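
// Concrete shuffle mask for the recurrence splice above: with VF = 4 the mask
// built from ShuffleMask[0] = VF - 1 and ShuffleMask[I] = I + VF - 1 is
// <3, 4, 5, 6>, i.e. the last lane of the previous vector followed by the
// first three lanes of the current one, matching the
// "v3 = vector(v1(3), v2(0, 1, 2))" value in the example at the top of this
// function.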
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former.
  if (Cost->foldTailByMasking()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst =
          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"),
          RdxDesc.getFastMathFlags());
    else
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      RdxPart);
  }

  if (VF > 1) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
          RdxDesc.isSigned()
              ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
              : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
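
// Summary example for the identity logic above (mirroring the comment in the
// function): an integer add reduction uses 0 as its identity, a multiply uses
// 1, an 'and' uses -1, and min/max reductions reuse the start value. Only the
// phi of unroll part 0 receives VectorStart (the identity with the scalar
// start value inserted into lane 0); the remaining parts start from the plain
// identity so the final cross-part combine does not double-count the start.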
void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getNumIncomingValues() == 1) {
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values will have only one value.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
      // Can be a loop invariant incoming value or the last scalar value to be
      // extracted from the vectorized loop.
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
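
// For instance (a hypothetical case, not taken from a specific test): if a
// scalarized address computation in the vector body is used only by the
// predicated instruction, the loop above moves it into the predicated block,
// so it executes only when that block's condition holds, and then queues its
// own operands to see whether they can be sunk as well.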
void InnerLoopVectorizer::fixNonInductionPHIs() {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    PHINode *NewPhi =
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
        predecessors(OrigPhi->getParent()));
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
        predecessors(NewPhi->getParent()));
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
           "Scalar and Vector BB should have the same number of predecessors");

    // The insertion point in Builder may be invalidated by the time we get
    // here. Force the Builder insertion point to something valid so that we do
    // not run into issues during insertion point restore in
    // getOrCreateVectorValue calls below.
    Builder.SetInsertPoint(NewPhi);

    // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
      BasicBlock *NewPredBB = VectorBBPredecessors[i];

      // When looking up the new scalar/vector values to fix up, use incoming
      // values from original phi.
      Value *ScIncV =
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

      // Scalar incoming value may need a broadcast
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
      NewPhi->addIncoming(NewIncV, NewPredBB);
    }
  }
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);
    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
    // These are the scalar results. Notice that we don't generate vector GEPs
    // because scalar GEPs result in better code.
    for (unsigned Part = 0; Part < UF; ++Part) {
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep =
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
        SclrGep->setName("next.gep");
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
      }
    }
    return;
  }
  }
}
/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  return !CInt || CInt->isZero();
}
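
// Quick illustration of the predicate above: 'udiv i32 %x, 7' returns false
// (the divisor is a non-zero constant, so no predication is needed), while
// 'udiv i32 %x, %y' with a non-constant %y returns true, since the divisor
// could be zero at runtime.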
4024 void InnerLoopVectorizer::widenInstruction(Instruction
&I
) {
4025 switch (I
.getOpcode()) {
4026 case Instruction::Br
:
4027 case Instruction::PHI
:
4028 llvm_unreachable("This instruction is handled by a different recipe.");
4029 case Instruction::GetElementPtr
: {
4030 // Construct a vector GEP by widening the operands of the scalar GEP as
4031 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4032 // results in a vector of pointers when at least one operand of the GEP
4033 // is vector-typed. Thus, to keep the representation compact, we only use
4034 // vector-typed operands for loop-varying values.
4035 auto *GEP
= cast
<GetElementPtrInst
>(&I
);
4037 if (VF
> 1 && OrigLoop
->hasLoopInvariantOperands(GEP
)) {
4038 // If we are vectorizing, but the GEP has only loop-invariant operands,
4039 // the GEP we build (by only using vector-typed operands for
4040 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4041 // produce a vector of pointers, we need to either arbitrarily pick an
4042 // operand to broadcast, or broadcast a clone of the original GEP.
4043 // Here, we broadcast a clone of the original.
4045 // TODO: If at some point we decide to scalarize instructions having
4046 // loop-invariant operands, this special case will no longer be
4047 // required. We would add the scalarization decision to
4048 // collectLoopScalars() and teach getVectorValue() to broadcast
4049 // the lane-zero scalar value.
4050 auto *Clone
= Builder
.Insert(GEP
->clone());
4051 for (unsigned Part
= 0; Part
< UF
; ++Part
) {
4052 Value
*EntryPart
= Builder
.CreateVectorSplat(VF
, Clone
);
4053 VectorLoopValueMap
.setVectorValue(&I
, Part
, EntryPart
);
4054 addMetadata(EntryPart
, GEP
);
4057 // If the GEP has at least one loop-varying operand, we are sure to
4058 // produce a vector of pointers. But if we are only unrolling, we want
4059 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4060 // produce with the code below will be scalar (if VF == 1) or vector
4061 // (otherwise). Note that for the unroll-only case, we still maintain
4062 // values in the vector mapping with initVector, as we do for other
4064 for (unsigned Part
= 0; Part
< UF
; ++Part
) {
4065 // The pointer operand of the new GEP. If it's loop-invariant, we
4066 // won't broadcast it.
4068 OrigLoop
->isLoopInvariant(GEP
->getPointerOperand())
4069 ? GEP
->getPointerOperand()
4070 : getOrCreateVectorValue(GEP
->getPointerOperand(), Part
);
4072 // Collect all the indices for the new GEP. If any index is
4073 // loop-invariant, we won't broadcast it.
4074 SmallVector
<Value
*, 4> Indices
;
4075 for (auto &U
: make_range(GEP
->idx_begin(), GEP
->idx_end())) {
4076 if (OrigLoop
->isLoopInvariant(U
.get()))
4077 Indices
.push_back(U
.get());
4079 Indices
.push_back(getOrCreateVectorValue(U
.get(), Part
));
4082 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4083 // but it should be a vector, otherwise.
4086 ? Builder
.CreateInBoundsGEP(GEP
->getSourceElementType(), Ptr
,
4088 : Builder
.CreateGEP(GEP
->getSourceElementType(), Ptr
, Indices
);
4089 assert((VF
== 1 || NewGEP
->getType()->isVectorTy()) &&
4090 "NewGEP is not a pointer vector");
4091 VectorLoopValueMap
.setVectorValue(&I
, Part
, NewGEP
);
4092 addMetadata(NewGEP
, GEP
);
4098 case Instruction::UDiv
:
4099 case Instruction::SDiv
:
4100 case Instruction::SRem
:
4101 case Instruction::URem
:
4102 case Instruction::Add
:
4103 case Instruction::FAdd
:
4104 case Instruction::Sub
:
4105 case Instruction::FSub
:
4106 case Instruction::FNeg
:
4107 case Instruction::Mul
:
4108 case Instruction::FMul
:
4109 case Instruction::FDiv
:
4110 case Instruction::FRem
:
4111 case Instruction::Shl
:
4112 case Instruction::LShr
:
4113 case Instruction::AShr
:
4114 case Instruction::And
:
4115 case Instruction::Or
:
4116 case Instruction::Xor
: {
4117 // Just widen unops and binops.
4118 setDebugLocFromInst(Builder
, &I
);
4120 for (unsigned Part
= 0; Part
< UF
; ++Part
) {
4121 SmallVector
<Value
*, 2> Ops
;
4122 for (Value
*Op
: I
.operands())
4123 Ops
.push_back(getOrCreateVectorValue(Op
, Part
));
4125 Value
*V
= Builder
.CreateNAryOp(I
.getOpcode(), Ops
);
4127 if (auto *VecOp
= dyn_cast
<Instruction
>(V
))
4128 VecOp
->copyIRFlags(&I
);
4130 // Use this vector value for all users of the original instruction.
4131 VectorLoopValueMap
.setVectorValue(&I
, Part
, V
);
  case Instruction::Select: {
    // Widen selects.
    // If the selector is loop invariant we can create a select
    // instruction with a scalar condition. Otherwise, use vector-select.
    auto *SE = PSE.getSE();
    bool InvariantCond =
        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
    setDebugLocFromInst(Builder, &I);

    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
    // We have to take the 'vectorized' value and pick the first lane.
    // Instcombine will make this a no-op.

    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
      Value *Sel =
          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
      addMetadata(Sel, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = dyn_cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      VectorLoopValueMap.setVectorValue(&I, Part, C);
      addMetadata(C, &I);
    }

    break;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = dyn_cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    // Vectorize casts.
    Type *DestTy =
        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
      addMetadata(Cast, &I);
    }

    break;
  }
  case Instruction::Call: {
    // Ignore dbg intrinsics.
    if (isa<DbgInfoIntrinsic>(I))
      break;
    setDebugLocFromInst(Builder, &I);

    Module *M = I.getParent()->getParent()->getParent();
    auto *CI = cast<CallInst>(&I);

    StringRef FnName = CI->getCalledFunction()->getName();
    Function *F = CI->getCalledFunction();
    Type *RetTy = ToVectorTy(CI->getType(), VF);
    SmallVector<Type *, 4> Tys;
    for (Value *ArgOperand : CI->arg_operands())
      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    // The flag shows whether we use an Intrinsic or a usual Call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
    bool NeedToScalarize;
    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
    assert((UseVectorIntrinsic || !NeedToScalarize) &&
           "Instruction should be scalarized elsewhere.");
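    // For example, a call to llvm.sqrt.f32 can be widened to llvm.sqrt.v4f32
    // when VF == 4, provided the intrinsic's vector cost is no higher than the
    // cost of calling a vectorized library function.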
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 4> Args;
      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
        Value *Arg = CI->getArgOperand(i);
        // Some intrinsics have a scalar argument - don't replace it with a
        // vector.
        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
        Args.push_back(Arg);
      }

      Function *VectorF;
      if (UseVectorIntrinsic) {
        // Use vector version of the intrinsic.
        Type *TysForDecl[] = {CI->getType()};
        if (VF > 1)
          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      } else {
        // Use vector version of the library call.
        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
        assert(!VFnName.empty() && "Vector function name is empty.");
        VectorF = M->getFunction(VFnName);
        if (!VectorF) {
          // Generate a declaration.
          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
          VectorF =
              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
          VectorF->copyAttributesFrom(F);
        }
      }
      assert(VectorF && "Can't create vector function.");

      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

      if (isa<FPMathOperator>(V))
        V->copyFastMathFlags(CI);

      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }

  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // DT is not kept up-to-date for outer loop vectorization.
  if (EnableVPlanNativePath)
    return;

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopMiddleBlock,
                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };
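  // For example, a getelementptr whose only users are the address computation
  // of a scalarized store ends up in ScalarPtrs, whereas one that also feeds a
  // gather/scatter or a non-memory instruction ends up in
  // PossibleNonScalarPtrs.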
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).

  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Add to the worklist all pointer induction variables and their update
  // instructions.
  //
  // TODO: Once we are able to vectorize pointer induction variables we should
  // no longer insert them into the worklist here.
  auto *Latch = TheLoop->getLoopLatch();
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
      continue;
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // We already considered pointer induction variables, so there's no reason
    // to look at their users again.
    //
    // TODO: Once we are able to vectorize pointer induction variables we
    // should no longer skip over them here.
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF > 1) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    return isa<LoadInst>(I) ?
        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    return mayDivideByZero(*I);
  }
  return false;
}
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
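  // For example, on typical x86 targets an x86_fp80 value is 80 bits wide but
  // is allocated 96 or 128 bits, so a vector of such values cannot be formed
  // without padding.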
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block
  // that needs predication, or it was decided to use masking to deal with
  // gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
                          : TTI.isLegalMaskedStore(Ty);
}
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we don't find any uniform value, we won't
  // analyze it again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
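  // For example, the address of a unit-stride (CM_Widen) or interleaved access
  // only needs a scalar address computation, so it can stay uniform, whereas a
  // gather/scatter or scalarized access needs a separate address per lane.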
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          })) {
        Worklist.insert(OI);
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
                      << "\n");
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this since it's still likely to be
    // dynamically uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                           "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI))
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

  unsigned MaxVF = computeFeasibleMaxVF(TC);
  if (TC > 0 && TC % MaxVF == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }
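  // For example, a constant trip count of 64 with MaxVF == 8 leaves no tail
  // (64 % 8 == 0), so the scalar-epilogue restriction is irrelevant and MaxVF
  // can be accepted as is.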
  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}
unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  unsigned MaxVectorSize = WidestRegister / WidestType;
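  // For example, a 256-bit widest register and a widest element type of 32
  // bits give MaxVectorSize = 256 / 32 = 8 lanes.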
  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<unsigned, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
    for (int i = RUs.size() - 1; i >= 0; --i) {
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
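    // For example, if the scalar loop costs 10 and the VF == 4 loop costs 24,
    // the per-lane cost is 24 / 4 = 6, which beats the scalar cost of 10.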
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
  return Factor;
}
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
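  // For example, a loop whose narrowest memory access loads i8 values and
  // whose widest access stores i32 values returns {8, 32}.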
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }

  return {MinWidth, MaxWidth};
}
unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                    << " registers\n");

  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                              R.MaxLocalUsers);
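  // For example, with 16 target registers, 2 of them tied up by loop-invariant
  // values and at most 3 registers live at once inside the loop, we get
  // IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.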
  // Don't count the induction variable as interleaved.
  if (EnableIndVarRegisterHeur)
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                       std::max(1U, (R.MaxLocalUsers - 1)));

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If the trip count is constant, limit the interleave count to be less than
  // the trip count divided by VF.
  if (TC > 0) {
    assert(TC >= VF && "VF exceeds trip count?");
    if ((TC / VF) < MaxInterleaveCount)
      MaxInterleaveCount = (TC / VF);
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && !Legal->getReductionVars()->empty()) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count)
    // are saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
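    // For example, with IC == 8, two stores and one load, StoresIC == 4 and
    // LoadsIC == 8, so interleaving by max(4, 8) == 8 would be chosen if it
    // exceeds SmallIC.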
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so
    // the critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi map that holds the list of
  // intervals that *end* at a specific location. This multi map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };
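  // For example, with a 256-bit widest register, a 64-bit element type and
  // VF == 8, a single value occupies 8 * 64 / 256 = 2 vector registers.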
  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (Ends.find(I) == Ends.end())
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      if (VFs[j] == 1) {
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
        continue;
      }
      collectUniformsAndScalars(VFs[j]);
      // Count the number of live intervals.
      unsigned RegUsage = 0;
      for (auto Inst : OpenIntervals) {
        // Skip ignored values for VF > 1.
        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
            isScalarAfterVectorization(Inst, VFs[j]))
          continue;
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      }
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else {
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
                      << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Emulation of a limited number of Masked Store/Scatter operations was
  // allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF))
          ScalarCost += TTI.getScalarizationOverhead(
              ToVectorTy(J->getType(), VF), false, true);
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();
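    // For example, assuming the default reciprocal block probability of 2 (the
    // predicated block is expected to execute on every other iteration), a raw
    // scalar cost of 10 is scaled down to 5 before being compared against the
    // vector cost.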
    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
          (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = ForceTargetInstructionCost;

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block. Thus, scale the block's cost by the probability of
    // executing it.
    if (VF == 1 && blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}
/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
    Value *Ptr,
    LoopVectorizationLegality *Legal,
    PredicatedScalarEvolution &PSE,
    const Loop *TheLoop) {

  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  Cost += VF *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
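
// Estimates the cost of widening a consecutive (stride +1 or -1) access into
// a single vector load/store, adding a reverse shuffle for negative strides.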
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                             unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  unsigned Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  return Cost;
}
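
// Cost of a memory operation on a loop-invariant address: a scalar load plus
// a broadcast, or a scalar store plus an extract when the stored value is not
// itself loop-invariant.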
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                         unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
                                              Instruction::ExtractElement,
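                                              VectorTy, VF - 1));
}

// Cost of emitting the access as a (possibly masked) gather or scatter.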
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                          unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                    Legal->isMaskRequired(I), Alignment);
}
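
// Cost of vectorizing the whole interleave group that contains I, including
// the wide memory operation and any reverse shuffles for reversed groups.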
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}
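
// For VF == 1 this computes the scalar memory cost directly; for wider VFs
// the cost was already recorded by setCostBasedWideningDecision() and is
// simply looked up.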
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                              unsigned VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF == 1) {
    Type *ValTy = getMemInstValueType(I);
    unsigned Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
  }
  return getWideningCost(I, VF);
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = 1;

  if (VF > 1 && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.find(I) != InstSet.end())
      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
  }

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized =
      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
  return VectorizationCostTy(C, TypeNotScalarized);
}
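
// Estimates the insertelement/extractelement overhead incurred when an
// instruction is scalarized while its operands or users are vectorized.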
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                              unsigned VF) {
  if (VF == 1)
    return 0;

  unsigned Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(RetTy, true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), VF);
}
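
// For every memory instruction in the loop, compare the costs of widening,
// interleaving, gather/scatter and scalarization for this VF and record the
// cheapest decision for later use by the cost model and the vectorizer.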
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
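
// Per-opcode cost estimation. VectorTy is updated to the type the instruction
// will have after vectorization, so the caller can query how it legalizes.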
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
             PredicatedBBsAfterVectorization.end() ||
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
             PredicatedBBsAfterVectorization.end()))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF > 1 && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

      // The cost of the non-predicated instruction.
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
  }
  case Instruction::FNeg: {
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0));
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    unsigned Width = VF;
    if (Width > 1) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = 1;
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm

bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : *Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : *Legal->getInductionVars()) {
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}

VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}
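
// Records the chosen VF/UF and keeps only the VPlan that can handle that VF;
// all other candidate plans are erased.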
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}

void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  VPTransformState State{BestVF, BestUF,      LI,
                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,   CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}

void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  // We create new control-flow for the vectorized loop, so the original
  // condition will be dead after vectorization if it's only used by the
  // branch.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  for (auto &Induction : *Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
                                 DeadInstructions.end();
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}

Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating point operations had to be 'fast' to enable the unrolling.
    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}

static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}

bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}

/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}
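
// Compute (and cache) the mask of the edge Src->Dst: the source block's mask
// combined with the branch condition, negated for the false successor.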
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional())
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

  return EdgeMaskCache[Edge] = EdgeMask;
}

VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}

VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
                                                           VFRange &Range,
                                                           VPlanPtr &Plan) {
  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
  if (!IG)
    return nullptr;

  // Now check if IG is relevant for VF's in the given range.
  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
    return [=](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(I, VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
  };
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
    return nullptr;

  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
  assert(I == IG->getInsertPos() &&
         "Generating a recipe for an adjunct member of an interleave group");

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPInterleaveRecipe(IG, Mask);
}

VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                  VPlanPtr &Plan) {
  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
    return nullptr;

  auto willWiden = [&](unsigned VF) -> bool {
    if (VF == 1)
      return false;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
           "Interleave memory opportunity should be caught earlier.");
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPWidenMemoryInstructionRecipe(*I, Mask);
}

VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
    // Check if this is an integer or fp induction. If so, build the recipe that
    // produces its scalar and vector values.
    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
        II.getKind() == InductionDescriptor::IK_FpInduction)
      return new VPWidenIntOrFpInductionRecipe(Phi);

    return nullptr;
  }

  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(unsigned)> {
    return
        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
  };

  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                               isOptimizableIVTruncate(I), Range))
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             cast<TruncInst>(I));
  return nullptr;
}

VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
  PHINode *Phi = dyn_cast<PHINode>(I);
  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
    return nullptr;

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  SmallVector<VPValue *, 2> Masks;
  unsigned NumIncoming = Phi->getNumIncomingValues();
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    if (EdgeMask)
      Masks.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, Masks);
}

bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                 VFRange &Range) {

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  if (IsPredicated)
    return false;

  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::Br:
    case Instruction::Call:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::GetElementPtr:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::Load:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PHI:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Store:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return false;

  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
      return false;
  }

  auto willWiden = [&](unsigned VF) -> bool {
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                             CM.isProfitableToScalarize(I, VF)))
      return false;
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
      return UseVectorIntrinsic || !NeedToScalarize;
    }
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      assert(CM.getWideningDecision(I, VF) ==
                 LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care by now");
      return false;
    }
    return true;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return false;

  // Success: widen this instruction. We optimize the common case where
  // consecutive instructions can be represented by a single recipe.
  if (!VPBB->empty()) {
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
      return true;
  }

  VPBB->appendRecipe(new VPWidenRecipe(I));
  return true;
}

VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}

VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}

bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
  VPRecipeBase *Recipe = nullptr;
  // Check if Instr should belong to an interleave memory recipe, or already
  // does. In the latter case Instr is irrelevant.
  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr is a memory operation that should be widened.
  if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr should form some PHI recipe.
  if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if ((Recipe = tryToBlend(Instr, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
    VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
    return true;
  }

  // Check if Instr is to be widened by a general VPWidenRecipe, after
  // having first checked for specific widening recipes that deal with
  // Interleave Groups, Inductions and Phi nodes.
  if (tryToWiden(Instr, VPBB, Range))
    return true;

  return false;
}
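
// Build a VPlan (with recipes) for every feasible sub-range of VFs in
// [MinVF, MaxVF], after collecting values that need VPlan defs and
// instructions that will be trivially dead in the vectorized loop.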
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable
  // needs to be represented in VPlan for it to model early-exit masking.
  // Also, both the Phi and the live-out instruction of each reduction are
  // required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : *Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
    VF = SubRange.End;
  }
}

VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = std::make_unique<VPlan>(VPBB);

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    std::vector<Instruction *> Ingredients;

    // Organize the ingredients to vectorize from current basic block in the
    // right order.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
      // member of the IG, do not construct any Recipe for it.
      const InterleaveGroup<Instruction> *IG =
          CM.getInterleavedAccessGroup(Instr);
      if (IG && Instr != IG->getInsertPos() &&
          Range.Start >= 2 && // Query is illegal for VF == 1
          CM.getWideningDecision(Instr, Range.Start) ==
              LoopVectorizationCostModel::CM_Interleave) {
        auto SinkCandidate = SinkAfterInverse.find(Instr);
        if (SinkCandidate != SinkAfterInverse.end())
          Ingredients.push_back(SinkCandidate->second);
        continue;
      }

      // Move instructions to handle first-order recurrences, step 1: avoid
      // handling this instruction until after we've handled the instruction it
      // should follow.
      auto SAIt = SinkAfter.find(Instr);
      if (SAIt != SinkAfter.end()) {
        LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
                          << *SAIt->second
                          << " to vectorize a 1st order recurrence.\n");
        SinkAfterInverse[SAIt->second] = Instr;
        continue;
      }

      Ingredients.push_back(Instr);

      // Move instructions to handle first-order recurrences, step 2: push the
      // instruction to be sunk at its insertion point.
      auto SAInvIt = SinkAfterInverse.find(Instr);
      if (SAInvIt != SinkAfterInverse.end())
        Ingredients.push_back(SAInvIt->second);
    }

    // Introduce each ingredient into VPlan.
    for (Instruction *Instr : Ingredients) {
      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
        continue;

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : *Legal->getReductionVars()) {
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}

VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
      Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

Value *LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}

void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  if (User) {
    O << ", ";
    User->getOperand(0)->printAsOperand(O);
  }
  O << "\\l\"";
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
}

void VPWidenRecipe::execute(VPTransformState &State) {
  for (auto &Instr : make_range(Begin, End))
    State.ILV->widenInstruction(Instr);
}

void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = Phi->getNumIncomingValues();

  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with predecessors having a full mask");
  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1, In0)))
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}
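
// A replicate recipe clones its ingredient as scalar instructions rather than
// widening it: either a single requested instance (when State.Instance is set,
// e.g. inside a predicated block) or one copy per unroll part and vector lane.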

void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}
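
// VPBranchOnMaskRecipe, the predicated VPReplicateRecipe and
// VPPredInstPHIRecipe together implement predicated scalarization: the branch
// guards a block holding the replicated instance, and the phi recipe below
// merges its result back on the unpredicated path.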

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  if (!User)
    return State.ILV->vectorizeMemoryInstruction(&Instr);

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
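
// Decide how (or whether) the scalar epilogue of a vectorized loop may be
// generated, based on explicit hints and on size heuristics (function
// attributes and profile-guided size information).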

static ScalarEpilogueLowering
getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      (F->hasOptSize() ||
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
    SEL = CM_ScalarEpilogueNotAllowedOptSize;
  else if (PreferPredicateOverEpilog || Hints.getPredicate())
    SEL = CM_ScalarEpilogueNotNeededUsePredicate;

  return SEL;
}

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}
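
// Per-loop driver for the inner-loop path: runs the legality checks, cost
// model and planner, emits the relevant optimization remarks, and finally
// generates vector and/or interleaved code for a single loop.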

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  // Prefer constant trip counts over profile data, over upper bound estimate.
  unsigned ExpectedTC = 0;
  bool HasExpectedTC = false;
  if (const SCEVConstant *ConstExits =
          dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
    const APInt &ExitsCount = ConstExits->getAPInt();
    // We are interested in small values for ExpectedTC. Skip over those that
    // can't fit an unsigned.
    if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
      ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
      HasExpectedTC = true;
    }
  }
  // ExpectedTC may be large because it's bound by a variable. Check
  // profiling information to validate we should vectorize.
  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
    auto EstimatedTC = getLoopEstimatedTripCount(L);
    if (EstimatedTC) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
  }
  if (!HasExpectedTC) {
    ExpectedTC = SE->getSmallConstantMaxTripCount(L);
    HasExpectedTC = (ExpectedTC > 0);
  }

  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;

    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}
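
// Shared implementation behind both pass-manager entry points: caches the
// per-function analyses and visits every supported loop in the function.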

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // loop.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();