//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
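//
// For example (illustrative only), with a vectorization factor (VF) of 4 a
// scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each 'wide' iteration processes four
// elements at once and the induction variable advances by four:
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];  // one SIMD add per wide iteration
//
// with the remaining iterations handled by a scalar epilogue or by
// predication.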
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <functional>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired, predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

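// For example (illustrative): ToVectorTy(i32, 4) yields <4 x i32>, while
// ToVectorTy(i32, 1) and ToVectorTy(void, VF) return the incoming type
// unchanged.
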
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

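// For example (illustrative, assuming a typical x86-64 data layout): x86_fp80
// has a store size of 10 bytes but an alloc size of 16 bytes, so an array of
// VF elements occupies VF * 16 bytes while a <VF x x86_fp80> vector stores
// only VF * 10 bytes; the type is therefore "irregular" and such accesses are
// not widened by simply bitcasting consecutive memory.
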
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

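// For example (illustrative): under the 50% assumption above, an instruction
// in a predicated block whose scalar cost is C contributes roughly
// C / getReciprocalPredBlockProb() == C / 2 to the estimated per-iteration
// cost of the loop.
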
/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
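  ///
  /// For example (illustrative IR, assuming VF = 4): if %d = add i32 %x, %y is
  /// scalarized, its four scalar copies %d.0 ... %d.3 are recorded in the
  /// scalar map; when a vector use is visited, the vector form is built on
  /// demand roughly as
  ///   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
  ///   ... (lanes 2 and 3 likewise)
  /// and the result is cached in VectorLoopValueMap for subsequent uses.
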
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
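
  // For example (illustrative): with UF = 2 and VF = 4, a vectorized value is
  // held as 2 vector Values (one per unrolled part, see VectorParts), while a
  // scalarized value is held as 2 x 4 scalar Values addressed by (Part, Lane)
  // through ScalarParts.
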
  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;
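
  // For example (illustrative): for an integer induction that starts at 10
  // with step 3, transforming Index = 5 produces code computing
  // 10 + 5 * 3 == 25; for a pointer induction with start %p and step 4, the
  // same index addresses &%p[5 * 4].
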
  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {
void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}
} // end namespace llvm

/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
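///
/// For example (illustrative numbers): if the scalar loop body costs 8 units
/// and the VF = 4 body costs 20 units, the per-lane cost 20 / 4 = 5 is cheaper
/// than 8, so a vectorization factor of 4 is considered profitable;
/// selectVectorizationFactor compares candidate VFs using costs normalized by
/// the VF in this way.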
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

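  // For example (illustrative): for an interleave group {A, B, C} with factor
  // 3 whose insert position is B, the decision (W, Cost) is recorded for B and
  // (W, 0) for A and C, so the group's cost is counted exactly once when
  // per-instruction costs are summed.
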
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

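  // For example (illustrative): given a primary i64 induction %iv, a truncate
  // %t = trunc i64 %iv to i32 inside the loop is "optimizable": rather than
  // widening the trunc, the vectorizer can introduce a new i32 induction
  // variable and use it directly in place of %t.
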
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either the vector version isn't available, or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              unsigned VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(unsigned VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(unsigned VF);
  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                                std::pair<InstWidening, unsigned>>;

  DecisionList WideningDecisions;
  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, unsigned VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  }

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   unsigned VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

} // end namespace llvm
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
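// Illustrative example (not part of the original source): an outer loop
// qualifies for the checks above when it carries an enabling hint together
// with an explicit vector length, e.g.
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
//
// The same loop annotated with an interleave hint (interleave_count) would be
// rejected by the interleave check above.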
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.empty() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}
/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID) {
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI);
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}
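// Illustrative sketch (not part of the original source): for VF = 4, the
// CreateVectorSplat call above typically materializes the broadcast as an
// insertelement into lane 0 followed by a zero-mask shufflevector, roughly
//   %t         = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast = shufflevector <4 x i32> %t, <4 x i32> undef,
//                              <4 x i32> zeroinitializer
// The exact IR depends on IRBuilder constant folding; this only shows the
// shape of the emitted splat.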
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
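// Illustrative sketch (assumed example, not from the source): for an i32
// induction with start 0, step 1, VF = 4 and UF = 1, the code above produces
// roughly
//   vector.body:
//     %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
// i.e. SplatVF holds VF * Step and the final step.add feeds the phi back edge.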
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  int VLen = Val->getType()->getVectorNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}
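// Worked example (illustrative only): for an integer Val that is a splat of
// %x, StartIdx = 0, Step = %s and VLen = 4, the integer branch above computes
//   %induction = %Val + <0, 1, 2, 3> * splat(%s)
// so lane i holds %x + i * %s, one value per consecutive scalar iteration;
// the FP branch does the same with fmul/fadd (or fsub) under fast-math flags.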
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF > 1 && "VF should be greater than one");

  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                         : VF;
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}
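// Worked example (illustrative only): with VF = 4, UF = 2 and a non-uniform
// EntryVal, the nested loop above materializes eight scalar values
//   ScalarIV + (VF * Part + Lane) * Step,   Part in {0,1}, Lane in {0..3}
// i.e. offsets 0..7 times Step, one per lane of the two unrolled parts; a
// uniform EntryVal only gets the Lane-0 value of each part.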
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  assert(Instance.Lane > 0
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
  auto *U = getOrCreateVectorValue(V, Instance.Part);
  if (!U->getType()->isVectorTy()) {
    assert(VF == 1 && "Value not scalarized has non-vector type");
    return U;
  }

  // Otherwise, the value from the original loop has been vectorized and is
  // represented by UF vector values. Extract and return the requested scalar
  // value from the appropriate vector lane.
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
}
void InnerLoopVectorizer::packScalarIntoVectorValue(
    Value *V, const VPIteration &Instance) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                            Builder.getInt32(Instance.Lane));
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant *, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}
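// For example (illustrative): with VF = 4 the mask built above is
// <3, 2, 1, 0>, so the first lane of the result is the last lane of Vec.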
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec    = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec    = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec    = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (IsMaskForCondRequired || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (IsMaskForCondRequired) {
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              Mask[Part], Undefs, RepMask, "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
                                            Group->getAlignment(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr;
    if (IsMaskForCondRequired) {
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          Mask[Part], Undefs, RepMask, "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    } else
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
                                                 Group->getAlignment());

    Group->addMetadata(NewStoreInstr);
  }
}
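// Note on the masked path above (illustrative sketch): createReplicatedMask
// widens the per-iteration block mask to interleaved-group width, e.g. for
// VF = 4 and InterleaveFactor = 3 a mask <m0, m1, m2, m3> becomes
//   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
// so all members of one tuple are enabled or disabled together, while
// MaskForGaps additionally clears the lanes that correspond to missing
// members of the group.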
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VectorParts *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  const Align Alignment =
      DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});

  VectorParts Mask;
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    Mask = *BlockInMask;

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
    InBounds = gep->isInBounds();

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
      PartPtr->setIsInBounds(InBounds);
    }

    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
                                            Alignment.value(), MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, Ptr);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
                                            Alignment.value(), Mask[Part]);
        else
          NewSI =
              Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
                                          "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}
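// Illustrative sketch (assumed example): for a consecutive, non-reversed
// access with VF = 4 and UF = 2, CreateVecPtr above produces the part
// pointers
//   Part 0: gep Ptr, 0   -> bitcast to <4 x Ty>*
//   Part 1: gep Ptr, 4   -> bitcast to <4 x Ty>*
// while the reversed case first steps by -Part * VF and then by 1 - VF, so
// each wide access starts at the last element of its group of VF lanes.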
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
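// Illustrative sketch (not part of the original source): the skeleton built
// above looks roughly like
//   header:
//     %index = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
//     ...
//   latch:
//     %index.next = add i64 %index, %step
//     %cmp        = icmp eq i64 %index.next, %end
//     br i1 %cmp, label %exit, label %header
// with the previous latch terminator erased as done just above.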
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  Constant *Step = ConstantInt::get(Ty, VF * UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // If there is a non-reversed interleaved group that may speculatively access
  // memory out-of-bounds, we need to ensure that there will be at least one
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step.
  if (VF > 1 && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
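// Worked example (illustrative only): with a trip count N = 100 and
// VF * UF = 8, n.mod.vf = 100 % 8 = 4 and n.vec = 96, leaving 4 scalar
// iterations. If a scalar epilogue is required and N = 96, the remainder of 0
// is bumped up to 8 above, giving n.vec = 88 so the epilogue runs at least
// once. When folding the tail by masking, N is first rounded up to a multiple
// of 8 instead.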
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
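// Illustrative example (assumed types): casting <2 x double> to <2 x i8*> is
// not a single bitcast, so the two-step path above goes through an integer
// vector whose element size matches (64 bits in a typical data layout):
//   <2 x double> --bitcast--> <2 x i64> --inttoptr--> <2 x i8*>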
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking())
    CheckMinIters = Builder.CreateICmp(
        P, Count, ConstantInt::get(Count->getType(), VF * UF),
        "min.iters.check");

  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
  LoopBypassBlocks.push_back(BB);
}
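// Illustrative sketch (not part of the original source): for VF = 4, UF = 2
// and no required scalar epilogue, the check inserted above is roughly
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %bypass, label %vector.ph
// i.e. the bypass block is taken whenever fewer than VF * UF iterations are
// available for the vector loop.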
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!BB->getParent()->hasOptSize() &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  if (!MemRuntimeCheck)
    return;

  if (BB->getParent()->hasOptSize()) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                          PSE.getSE());
  LVer->prepareNoAliasMetadata();
}
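// Sketch of the effect of the memory checks emitted above: for two pointers
// A and B accessed in the loop, the "vector.memcheck" block roughly tests
// whether the accessed ranges [A, A+N) and [B, B+N) overlap and, if so,
// branches to the bypass (scalar) block; the noalias metadata prepared via
// LoopVersioning lets later passes treat the vector loop's accesses as
// independent when the check passes.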
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check, the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
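// In effect, for an integer induction the transformed index above is
//   StartValue + Index * Step
// (with the trivial constant-step cases folded), for a pointer induction it
// is a GEP of StartValue by Index * Step elements, and for an FP induction
// it is the original FAdd/FSub applied with fast-math flags.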
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal = PHINode::Create(
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
  }

  // We need the OrigLoop (scalar loop part) latch terminator to help
  // produce correct debug info for the middle block BB instructions.
  // The legality check stage guarantees that the loop will have a single
  // latch.
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
         "Scalar loop latch terminator isn't a branch");
  BranchInst *ScalarLatchBr =
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  Value *CmpN = Builder.getTrue();
  if (!Cost->foldTailByMasking()) {
    CmpN =
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
  }

  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    Lp->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

  return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input
// value, coming from the remainder loop. We need those PHIs to also have a
// correct value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
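// For instance, given
//
//   for (i = 0; i < n; ++i) { ... }
//   use(i);
//
// the LCSSA phi for 'i' in the exit block gets the vector trip count as its
// incoming value from the middle block, while an external user of the
// pre-increment value gets Start + Step * (CountRoundDown - 1), recomputed
// above via emitTransformedIndex.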
namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}
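// Example of the effect of cse() above: two identical
// 'extractelement <4 x i64> %v, i32 0' instructions in the vector body
// collapse into one, with the second's uses rewritten to the first. Only the
// side-effect-free kinds accepted by CSEDenseMapInfo::canHandle participate.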
unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                                       unsigned VF,
                                                       bool &NeedToScalarize) {
  Function *F = CI->getCalledFunction();
  StringRef FnName = CI->getCalledFunction()->getName();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
  if (VF == 1)
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    return VectorCallCost;
  }
  return Cost;
}
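// Cost comparison performed above, in rough terms:
//
//   ScalarizedCost = ScalarCallCost * VF + ScalarizationCost
//   VectorCost     = cost of calling the target library's vector variant
//
// The scalarized cost is returned (with NeedToScalarize = true) unless a
// vector variant exists and is cheaper, in which case NeedToScalarize is
// cleared and the vector call cost is returned instead.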
unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                            unsigned VF) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");

  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<Value *, 4> Operands(CI->arg_operands());
  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
          !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}
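// For example, if the cost model determined that an i32 addition only needs
// 8 bits, each <4 x i32> add above is rewritten as
//   trunc to <4 x i8>  ->  add <4 x i8>  ->  zext back to <4 x i32>
// and InstCombine later removes the redundant trunc/zext pairs.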
void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Update the dominator tree.
  //
  // FIXME: After creating the structure of the new loop, the dominator tree is
  //        no longer up-to-date, and it remains that way until we update it
  //        here. An out-of-date dominator tree is problematic for SCEV,
  //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
  //        keep the dominator tree up-to-date as we go.
  updateAnalysis();

  // Fix-up external users of the induction variables.
  for (auto &Entry : *Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}
void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
      isa<PHINode>(PreviousLastPart))
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
  else
    Builder.SetInsertPoint(
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask.
  SmallVector<Constant *, 8> ShuffleMask(VF);
  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                             ConstantVector::get(ShuffleMask))
               : Incoming;
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF > 1)
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
  // of `Incoming`. This is analogous to the vectorized case above: extracting
  // the second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former.
  if (Cost->foldTailByMasking()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst =
          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"),
          RdxDesc.getFastMathFlags());
    else
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      RdxPart);
  }

  if (VF > 1) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
          RdxDesc.isSigned()
              ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
              : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
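// Concrete shape of the fix-up above for an integer add reduction with
// VF = 4, UF = 2: the two unrolled partial sums are combined with a
// "bin.rdx" add in the middle block, the resulting <4 x i32> is reduced
// horizontally by createTargetReduction, and the scalar result feeds both
// the LCSSA exit phi and the "bc.merge.rdx" phi that seeds the scalar
// remainder loop.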
void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getNumIncomingValues() == 1) {
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values will have only one value.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
      // Can be a loop invariant incoming value or the last scalar value to be
      // extracted from the vectorized loop.
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
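// For example, an address computation
//   %gep = getelementptr i32, i32* %base, i64 %idx
// whose only use is a store inside the predicated block gets moved into that
// block by the loop above, so it no longer executes when the predicate is
// false; its own operands are then reconsidered for sinking on the next pass.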
void InnerLoopVectorizer::fixNonInductionPHIs() {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    PHINode *NewPhi =
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
        predecessors(OrigPhi->getParent()));
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
        predecessors(NewPhi->getParent()));
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
           "Scalar and Vector BB should have the same number of predecessors");

    // The insertion point in Builder may be invalidated by the time we get
    // here. Force the Builder insertion point to something valid so that we do
    // not run into issues during insertion point restore in
    // getOrCreateVectorValue calls below.
    Builder.SetInsertPoint(NewPhi);

    // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
      BasicBlock *NewPredBB = VectorBBPredecessors[i];

      // When looking up the new scalar/vector values to fix up, use incoming
      // values from the original phi.
      Value *ScIncV =
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

      // Scalar incoming value may need a broadcast.
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
      NewPhi->addIncoming(NewIncV, NewPredBB);
    }
  }
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
    // These are the scalar results. Notice that we don't generate vector GEPs
    // because scalar GEPs result in better code.
    for (unsigned Part = 0; Part < UF; ++Part) {
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep =
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
        SclrGep->setName("next.gep");
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
      }
    }
    return;
  }
  }
}
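// For the pointer induction case above with VF = 4, UF = 1 and a unit step,
// each lane roughly produces
//   %next.gep = getelementptr T, T* %start, i64 (%index + lane)
// via emitTransformedIndex, rather than one vector GEP, matching the note
// that scalar GEPs result in better code here.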
/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  return !CInt || CInt->isZero();
}
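// Usage sketch: for a conditional 'x / y' in the scalar loop, a provably
// non-zero constant divisor lets the widened division execute
// unconditionally, whereas a variable (or zero) divisor keeps the division
// predicated so vectorization cannot introduce a trap the scalar code would
// not have executed.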
4061 void InnerLoopVectorizer::widenInstruction(Instruction
&I
) {
4062 switch (I
.getOpcode()) {
4063 case Instruction::Br
:
4064 case Instruction::PHI
:
4065 llvm_unreachable("This instruction is handled by a different recipe.");
4066 case Instruction::GetElementPtr
: {
4067 // Construct a vector GEP by widening the operands of the scalar GEP as
4068 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4069 // results in a vector of pointers when at least one operand of the GEP
4070 // is vector-typed. Thus, to keep the representation compact, we only use
4071 // vector-typed operands for loop-varying values.
4072 auto *GEP
= cast
<GetElementPtrInst
>(&I
);
4074 if (VF
> 1 && OrigLoop
->hasLoopInvariantOperands(GEP
)) {
4075 // If we are vectorizing, but the GEP has only loop-invariant operands,
4076 // the GEP we build (by only using vector-typed operands for
4077 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4078 // produce a vector of pointers, we need to either arbitrarily pick an
4079 // operand to broadcast, or broadcast a clone of the original GEP.
4080 // Here, we broadcast a clone of the original.
4082 // TODO: If at some point we decide to scalarize instructions having
4083 // loop-invariant operands, this special case will no longer be
4084 // required. We would add the scalarization decision to
4085 // collectLoopScalars() and teach getVectorValue() to broadcast
4086 // the lane-zero scalar value.
4087 auto *Clone
= Builder
.Insert(GEP
->clone());
4088 for (unsigned Part
= 0; Part
< UF
; ++Part
) {
4089 Value
*EntryPart
= Builder
.CreateVectorSplat(VF
, Clone
);
4090 VectorLoopValueMap
.setVectorValue(&I
, Part
, EntryPart
);
4091 addMetadata(EntryPart
, GEP
);
4094 // If the GEP has at least one loop-varying operand, we are sure to
4095 // produce a vector of pointers. But if we are only unrolling, we want
4096 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4097 // produce with the code below will be scalar (if VF == 1) or vector
4098 // (otherwise). Note that for the unroll-only case, we still maintain
4099 // values in the vector mapping with initVector, as we do for other
4101 for (unsigned Part
= 0; Part
< UF
; ++Part
) {
4102 // The pointer operand of the new GEP. If it's loop-invariant, we
4103 // won't broadcast it.
4105 OrigLoop
->isLoopInvariant(GEP
->getPointerOperand())
4106 ? GEP
->getPointerOperand()
4107 : getOrCreateVectorValue(GEP
->getPointerOperand(), Part
);
4109 // Collect all the indices for the new GEP. If any index is
4110 // loop-invariant, we won't broadcast it.
4111 SmallVector
<Value
*, 4> Indices
;
4112 for (auto &U
: make_range(GEP
->idx_begin(), GEP
->idx_end())) {
4113 if (OrigLoop
->isLoopInvariant(U
.get()))
4114 Indices
.push_back(U
.get());
4116 Indices
.push_back(getOrCreateVectorValue(U
.get(), Part
));
4119 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4120 // but it should be a vector, otherwise.
4123 ? Builder
.CreateInBoundsGEP(GEP
->getSourceElementType(), Ptr
,
4125 : Builder
.CreateGEP(GEP
->getSourceElementType(), Ptr
, Indices
);
4126 assert((VF
== 1 || NewGEP
->getType()->isVectorTy()) &&
4127 "NewGEP is not a pointer vector");
4128 VectorLoopValueMap
.setVectorValue(&I
, Part
, NewGEP
);
4129 addMetadata(NewGEP
, GEP
);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(Builder, &I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (Value *Op : I.operands())
        Ops.push_back(getOrCreateVectorValue(Op, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::Select: {
    // If the selector is loop invariant we can create a select
    // instruction with a scalar condition. Otherwise, use vector-select.
    auto *SE = PSE.getSE();
    bool InvariantCond =
        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
    setDebugLocFromInst(Builder, &I);

    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
    // We have to take the 'vectorized' value and pick the first lane.
    // Instcombine will make this a no-op.

    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
      Value *Sel =
          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
      addMetadata(Sel, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      VectorLoopValueMap.setVectorValue(&I, Part, C);
      addMetadata(C, &I);
    }

    break;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    /// Vectorize casts.
    Type *DestTy =
        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
      addMetadata(Cast, &I);
    }
    break;
  }
  case Instruction::Call: {
    // Ignore dbg intrinsics.
    if (isa<DbgInfoIntrinsic>(I))
      break;
    setDebugLocFromInst(Builder, &I);

    Module *M = I.getParent()->getParent()->getParent();
    auto *CI = cast<CallInst>(&I);

    StringRef FnName = CI->getCalledFunction()->getName();
    Function *F = CI->getCalledFunction();
    Type *RetTy = ToVectorTy(CI->getType(), VF);
    SmallVector<Type *, 4> Tys;
    for (Value *ArgOperand : CI->arg_operands())
      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize;
    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
    assert((UseVectorIntrinsic || !NeedToScalarize) &&
           "Instruction should be scalarized elsewhere.");
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 4> Args;
      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
        Value *Arg = CI->getArgOperand(i);
        // Some intrinsics have a scalar argument - don't replace it with a
        // vector.
        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
        Args.push_back(Arg);
      }

      Function *VectorF;
      if (UseVectorIntrinsic) {
        // Use vector version of the intrinsic.
        Type *TysForDecl[] = {CI->getType()};
        if (VF > 1)
          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      } else {
        // Use vector version of the library call.
        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
        assert(!VFnName.empty() && "Vector function name is empty.");
        VectorF = M->getFunction(VFnName);
        if (!VectorF) {
          // Generate a declaration
          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
          VectorF =
              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
          VectorF->copyAttributesFrom(F);
        }
      }
      assert(VectorF && "Can't create vector function.");

      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

      if (isa<FPMathOperator>(V))
        V->copyFastMathFlags(CI);

      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // DT is not kept up-to-date for outer loop vectorization
  if (EnableVPlanNativePath)
    return;

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopMiddleBlock,
                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };
  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).
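  //
  // For example (illustrative), a getelementptr that is used only as the
  // address of a consecutive (non-gather) load keeps a scalar address
  // computation, so it is seeded into the scalar worklist here.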
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Add to the worklist all pointer induction variables and their update
  // instructions.
  //
  // TODO: Once we are able to vectorize pointer induction variables we should
  // no longer insert them into the worklist here.
  auto *Latch = TheLoop->getLoopLatch();
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
      continue;
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }
  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }
  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // We already considered pointer induction variables, so there's no reason
    // to look at their users again.
    //
    // TODO: Once we are able to vectorize pointer induction variables we
    // should no longer skip over them here.
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF > 1) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    const MaybeAlign Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ?
        !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
      : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
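    // Divides in a predicated block are scalarized and predicated so that a
    // lane which would not execute in the original scalar loop cannot trap,
    // e.g. by dividing by zero.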
    return mayDivideByZero(*I);
  }
  return false;
}
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
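
  // For example (illustrative), the address of a consecutive load (CM_Widen)
  // stays a single scalar pointer in the vector loop, whereas a gather
  // (CM_GatherScatter) needs a vector of pointers, so only the former counts
  // as a uniform-preserving decision here.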
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          })) {
        Worklist.insert(OI);
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
                      << "\n");
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may by useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                           "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI))
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

  unsigned MaxVF = computeFeasibleMaxVF(TC);
  if (TC > 0 && TC % MaxVF == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
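  // For example (illustrative), a trip count of 100 with MaxVF = 8 leaves a
  // tail of 100 % 8 = 4 iterations; folding the tail masks off those lanes
  // instead of emitting a scalar epilogue.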
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}
unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  unsigned MaxVectorSize = WidestRegister / WidestType;
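
  // For example (illustrative), with 256-bit wide registers and a widest loop
  // type of i32, MaxVectorSize = 256 / 32 = 8 lanes.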
  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<unsigned, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
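    // For example (illustrative), if the scalar loop costs 8 and the VF = 4
    // loop costs 20, the per-lane cost is 20 / 4 = 5, which beats the scalar
    // cost and makes VF = 4 the current best width.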
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
  return Factor;
}
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }
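
  // For example (illustrative), a loop that loads i8 values and accumulates
  // them into an i64 reduction reports {8, 64}.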
  return {MinWidth, MaxWidth};
}
unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
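  //
  // For example (illustrative), with 16 registers in a class, 2 of them tied
  // up by loop-invariant values and a peak in-loop usage of 4, the estimate is
  // PowerOf2Floor((16 - 2) / 4) = 2 interleaved copies.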
  unsigned IC = UINT_MAX;

  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
    if (VF == 1) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If the trip count is constant, limit the interleave count to be less than
  // the trip count divided by VF.
  if (TC > 0) {
    assert(TC >= VF && "VF exceeds trip count?");
    if ((TC / VF) < MaxInterleaveCount)
      MaxInterleaveCount = (TC / VF);
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && !Legal->getReductionVars()->empty()) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };
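
  // For example (illustrative), a <4 x i64> value on a target with 128-bit
  // vector registers counts as max(1, 4 * 64 / 128) = 2 registers.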
  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (Ends.find(I) == Ends.end())
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j] == 1) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto &pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}
5543 int LoopVectorizationCostModel::computePredInstDiscount(
5544 Instruction
*PredInst
, DenseMap
<Instruction
*, unsigned> &ScalarCosts
,
5546 assert(!isUniformAfterVectorization(PredInst
, VF
) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF))
          ScalarCost += TTI.getScalarizationOverhead(
              ToVectorTy(J->getType(), VF), false, true);
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
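    // Illustrative sketch (numbers are hypothetical, not from any target):
    // with VF = 4, a predicated instruction whose vector cost is 28 and whose
    // per-lane scalar cost is 5 gives ScalarCost = 4 * 5 = 20, scaled down to
    // 10 by getReciprocalPredBlockProb() (2 by default), so this instruction
    // contributes a discount of 28 - 10 = 18 and scalarization looks
    // profitable.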
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
          (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = ForceTargetInstructionCost;

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block. Thus, scale the block's cost by the probability of
    // executing it.
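    // Illustrative sketch: getReciprocalPredBlockProb() currently returns 2,
    // i.e. the predicated block is assumed to execute on half the iterations,
    // so a predicated block whose instructions sum to a cost of 12 only
    // contributes 6 to the scalar (VF == 1) loop cost.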
    if (VF == 1 && blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}
/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
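  // For example (illustrative IR), the access A[j][i] with loop-invariant %j
  // inside a loop over %i:
  //   %gep = getelementptr [100 x i32], [100 x i32]* %A, i64 %j, i64 %i
  // qualifies, since every index is either loop-invariant or an induction
  // variable.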
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                   Alignment ? Alignment->value() : 0, AS);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                             unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  unsigned Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
                                      Alignment ? Alignment->value() : 0, AS);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
                                Alignment ? Alignment->value() : 0, AS, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  return Cost;
}
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                         unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy,
                               Alignment ? Alignment->value() : 0, AS) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy,
                             Alignment ? Alignment->value() : 0, AS) +
         (isLoopInvariantStoreValue
              ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       VF - 1));
}
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                          unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                    Legal->isMaskRequired(I),
                                    Alignment ? Alignment->value() : 0);
}
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
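  // Illustrative sketch (assuming VF = 4 and a factor-2 i32 load group with
  // both members present): conceptually the group is costed as one wide load
  // plus the shuffles that de-interleave even and odd elements, e.g.
  //   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  //   %strided.vec0 = shufflevector %wide.vec, undef, <0, 2, 4, 6>
  //   %strided.vec1 = shufflevector %wide.vec, undef, <1, 3, 5, 7>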
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                              unsigned VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF == 1) {
    Type *ValTy = getMemInstValueType(I);
    const MaybeAlign Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy,
                               Alignment ? Alignment->value() : 0, AS, I);
  }
  return getWideningCost(I, VF);
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = 1;

  if (VF > 1 && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.find(I) != InstSet.end())
      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
  }

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized =
      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
  return VectorizationCostTy(C, TypeNotScalarized);
}
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                              unsigned VF) {
  if (VF == 1)
    return 0;

  unsigned Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(RetTy, true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), VF);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
  if (VF == 1)
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
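        // Illustrative sketch of the expected code for a uniform load,
        // assuming VF = 4:
        //   %s      = load i32, i32* %invariant.ptr
        //   %insert = insertelement <4 x i32> undef, i32 %s, i32 0
        //   %splat  = shufflevector <4 x i32> %insert, undef, zeroinitializer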
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
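      // For example (illustrative numbers only): InterleaveCost = 12,
      // GatherScatterCost = 20 and ScalarizationCost = 16 picks CM_Interleave;
      // if interleaving is not possible its cost stays at UINT_MAX and the
      // choice falls back to gather/scatter vs. scalarization.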
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
             PredicatedBBsAfterVectorization.end() ||
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
             PredicatedBBsAfterVectorization.end()))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
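    // For example, a phi merging three if-converted paths,
    //   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
    // becomes two vector selects, so it is charged (3 - 1) select costs.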
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF > 1 && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

      // The cost of the non-predicated instruction.
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
  }
  case Instruction::FNeg: {
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0));
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    unsigned Width = VF;
    if (Width > 1) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = 1;
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : *Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : *Legal->getInductionVars()) {
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  VPTransformState State{BestVF, BestUF,      LI,
                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,   CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  // We create new control-flow for the vectorized loop, so the original
  // condition will be dead after vectorization if it's only used by the
  // branch.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  for (auto &Induction : *Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
                                 DeadInstructions.end();
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating point operations had to be 'fast' to enable the unrolling.
    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
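    // The added metadata takes the usual self-referential loop-ID form, e.g.
    //   br ..., !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}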
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);
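  // For example (illustrative): over the range [2, 16), a predicate that
  // holds for VF = 2 and VF = 4 but flips at VF = 8 clamps Range.End to 8,
  // so the decision taken at VF = 2 is reused for the remaining {2, 4}.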
  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional())
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

  return EdgeMaskCache[Edge] = EdgeMask;
}
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
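    // Illustrative sketch: with a trip count of 7 and VF = 4, BTC = 6 and the
    // second vector iteration compares <4,5,6,7> ule <6,6,6,6>, yielding the
    // mask <1,1,1,0> that disables the single out-of-range lane.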
    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}
VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
                                                           VFRange &Range,
                                                           VPlanPtr &Plan) {
  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
  if (!IG)
    return nullptr;

  // Now check if IG is relevant for VF's in the given range.
  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
    return [=](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(I, VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
  };
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
    return nullptr;

  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
  assert(I == IG->getInsertPos() &&
         "Generating a recipe for an adjunct member of an interleave group");

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPInterleaveRecipe(IG, Mask);
}
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                  VPlanPtr &Plan) {
  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
    return nullptr;

  auto willWiden = [&](unsigned VF) -> bool {
    if (VF == 1)
      return false;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
           "Interleave memory opportunity should be caught earlier.");
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPWidenMemoryInstructionRecipe(*I, Mask);
}
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
    // Check if this is an integer or fp induction. If so, build the recipe that
    // produces its scalar and vector values.
    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
        II.getKind() == InductionDescriptor::IK_FpInduction)
      return new VPWidenIntOrFpInductionRecipe(Phi);

    return nullptr;
  }

  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(unsigned)> {
    return
        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
  };

  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                               isOptimizableIVTruncate(I), Range))
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             cast<TruncInst>(I));
  return nullptr;
}
VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
  PHINode *Phi = dyn_cast<PHINode>(I);
  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
    return nullptr;

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  SmallVector<VPValue *, 2> Masks;
  unsigned NumIncoming = Phi->getNumIncomingValues();
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    if (EdgeMask)
      Masks.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, Masks);
}
bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                 VFRange &Range) {

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  if (IsPredicated)
    return false;

  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::Br:
    case Instruction::Call:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::GetElementPtr:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::Load:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PHI:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Store:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return false;

  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
      return false;
  }

  auto willWiden = [&](unsigned VF) -> bool {
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                             CM.isProfitableToScalarize(I, VF)))
      return false;
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
      return UseVectorIntrinsic || !NeedToScalarize;
    }
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      assert(CM.getWideningDecision(I, VF) ==
                 LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care by now");
      return false;
    }
    return true;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return false;

  // Success: widen this instruction. We optimize the common case where
  // consecutive instructions can be represented by a single recipe.
  if (!VPBB->empty()) {
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
      return true;
  }

  VPBB->appendRecipe(new VPWidenRecipe(I));
  return true;
}
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.
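  // Illustrative sketch of the structure emitted for a predicated store and
  // one lane (block names follow the RegionName scheme used below):
  //   pred.store.if:
  //     store i32 %lane.val, i32* %lane.ptr
  //     br label %pred.store.continue
  //   pred.store.continue:
  //     ...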
  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
  VPRecipeBase *Recipe = nullptr;
  // Check if Instr should belong to an interleave memory recipe, or already
  // does. In the latter case Instr is irrelevant.
  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr is a memory operation that should be widened.
  if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr should form some PHI recipe.
  if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if ((Recipe = tryToBlend(Instr, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
    VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
    return true;
  }

  // Check if Instr is to be widened by a general VPWidenRecipe, after
  // having first checked for specific widening recipes that deal with
  // Interleave Groups, Inductions and Phi nodes.
  if (tryToWiden(Instr, VPBB, Range))
    return true;

  return false;
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
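  // For example, for a loop body containing
  //   if (a[i] > 0) b[i] = 1;
  // the compare feeding the conditional branch gets a VPValue def here so
  // createEdgeMask() can later fold it into the mask of the guarded block.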
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable
  // needs to be represented in VPlan for it to model early-exit masking.
  // Also, both the Phi and the live-out instruction of each reduction are
  // required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : *Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
    VF = SubRange.End;
  }
}
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = std::make_unique<VPlan>(VPBB);

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    std::vector<Instruction *> Ingredients;

    // Organize the ingredients to vectorize from current basic block in the
    // right order.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
      // member of the IG, do not construct any Recipe for it.
      const InterleaveGroup<Instruction> *IG =
          CM.getInterleavedAccessGroup(Instr);
      if (IG && Instr != IG->getInsertPos() &&
          Range.Start >= 2 && // Query is illegal for VF == 1
          CM.getWideningDecision(Instr, Range.Start) ==
              LoopVectorizationCostModel::CM_Interleave) {
        auto SinkCandidate = SinkAfterInverse.find(Instr);
        if (SinkCandidate != SinkAfterInverse.end())
          Ingredients.push_back(SinkCandidate->second);
        continue;
      }

      // Move instructions to handle first-order recurrences, step 1: avoid
      // handling this instruction until after we've handled the instruction it
      // should follow.
7158 auto SAIt
= SinkAfter
.find(Instr
);
7159 if (SAIt
!= SinkAfter
.end()) {
7160 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt
->first
<< " after"
7162 << " to vectorize a 1st order recurrence.\n");
7163 SinkAfterInverse
[SAIt
->second
] = Instr
;
7167 Ingredients
.push_back(Instr
);
7169 // Move instructions to handle first-order recurrences, step 2: push the
7170 // instruction to be sunk at its insertion point.
7171 auto SAInvIt
= SinkAfterInverse
.find(Instr
);
7172 if (SAInvIt
!= SinkAfterInverse
.end())
7173 Ingredients
.push_back(SAInvIt
->second
);
7176 // Introduce each ingredient into VPlan.
7177 for (Instruction
*Instr
: Ingredients
) {
7178 if (RecipeBuilder
.tryToCreateRecipe(Instr
, Range
, Plan
, VPBB
))
7181 // Otherwise, if all widening options failed, Instruction is to be
7182 // replicated. This may create a successor for VPBB.
7183 VPBasicBlock
*NextVPBB
= RecipeBuilder
.handleReplication(
7184 Instr
, Range
, VPBB
, PredInst2Recipe
, Plan
);
7185 if (NextVPBB
!= VPBB
) {
7187 VPBB
->setName(BB
->hasName() ? BB
->getName() + "." + Twine(VPBBsForBB
++)
7193 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7194 // may also be empty, such as the last one VPBB, reflecting original
7195 // basic-blocks with no recipes.
7196 VPBasicBlock
*PreEntry
= cast
<VPBasicBlock
>(Plan
->getEntry());
7197 assert(PreEntry
->empty() && "Expecting empty pre-entry block.");
7198 VPBlockBase
*Entry
= Plan
->setEntry(PreEntry
->getSingleSuccessor());
7199 VPBlockUtils::disconnectBlocks(PreEntry
, Entry
);
7202 // Finally, if tail is folded by masking, introduce selects between the phi
7203 // and the live-out instruction of each reduction, at the end of the latch.
7204 if (CM
.foldTailByMasking()) {
7205 Builder
.setInsertPoint(VPBB
);
7206 auto *Cond
= RecipeBuilder
.createBlockInMask(OrigLoop
->getHeader(), Plan
);
7207 for (auto &Reduction
: *Legal
->getReductionVars()) {
7208 VPValue
*Phi
= Plan
->getVPValue(Reduction
.first
);
7209 VPValue
*Red
= Plan
->getVPValue(Reduction
.second
.getLoopExitInstr());
7210 Builder
.createNaryOp(Instruction::Select
, {Cond
, Red
, Phi
});
7214 std::string PlanName
;
7215 raw_string_ostream
RSO(PlanName
);
7216 unsigned VF
= Range
.Start
;
7218 RSO
<< "Initial VPlan for VF={" << VF
;
7219 for (VF
*= 2; VF
< Range
.End
; VF
*= 2) {
7225 Plan
->setName(PlanName
);
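// Illustrative sketch (hypothetical IR names): when the tail is folded by
// masking, a sum reduction with loop phi %red.phi and live-out update
// %red.next is guarded at the latch by the header's block-in mask, roughly:
//   %red.folded = select <VF x i1> %header.mask, <VF x i32> %red.next,
//                                                <VF x i32> %red.phi
// so lanes beyond the trip count keep the previous partial sum.
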
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: they may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
      Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

Value *LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}

void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  if (User) {
    O << ", ";
    User->getOperand(0)->printAsOperand(O);
  }
  O << "\\l\"";
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
}

void VPWidenRecipe::execute(VPTransformState &State) {
  for (auto &Instr : make_range(Begin, End))
    State.ILV->widenInstruction(Instr);
}

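// Illustrative sketch (hypothetical IR, VF = 4): widening an ingredient such as
//   %add = add nsw i32 %a, %b
// produces one wide instruction per unroll part, e.g.
//   %wide.add = add nsw <4 x i32> %wide.a, %wide.b
// using the vector operands cached by the InnerLoopVectorizer.
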
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = Phi->getNumIncomingValues();

  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with predecessors having a full mask");
  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               ( ...)))
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

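// Illustrative sketch (hypothetical values): a three-way blend with incoming
// vector values In1..In3 and edge masks Mask2/Mask3 becomes, per part,
//   %predphi  = select <VF x i1> %Mask2, <VF x i32> %In2, <VF x i32> %In1
//   %predphi1 = select <VF x i1> %Mask3, <VF x i32> %In3, <VF x i32> %predphi
// i.e. the SELECT(Mask3, In3, SELECT(Mask2, In2, In1)) chain described above.
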
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}

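// Illustrative sketch (hypothetical IR): an interleave group of factor 2, say
// the pair A[2*i] and A[2*i+1], is emitted at the group's insert position as a
// single wide load of 2 * VF elements followed by shufflevectors that
// de-interleave the even and odd lanes; when a mask is present, the per-part
// mask values gathered above are passed along to guard the wide access.
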
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

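// Illustrative sketch: with UF = 2 and VF = 4, a non-uniform replicated
// ingredient is cloned 2 * 4 = 8 times (one scalar copy per part and lane),
// whereas a uniform one (EndLane == 1) is cloned only twice, for lane 0 of
// each part.
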
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

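// Illustrative sketch (hypothetical IR, lane 1 of some part): when the block-in
// mask is a vector, the lane's condition bit is extracted and branched on, e.g.
//   %c = extractelement <4 x i1> %block.mask, i32 1
//   br i1 %c, label %pred.if, label %pred.continue
// with both successors filled in later, as the comment above notes.
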
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

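// Illustrative sketch (hypothetical IR names): for a scalar-only predicated
// value, the merge phi created above looks roughly like
//   %merged = phi i32 [ undef, %predicating.block ], [ %x, %predicated.block ]
// i.e. undef flows in from the predicating block and the computed value from
// the predicated block; vector users instead get a phi over the partially
// filled insertelement chain.
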
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  if (!User)
    return State.ILV->vectorizeMemoryInstruction(&Instr);

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}

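// Illustrative sketch (hypothetical IR, VF = 4): with a mask present, the
// widened store is expected to become a masked intrinsic along the lines of
//   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %p,
//                                              i32 4, <4 x i1> %mask)
// whereas the unmasked path above emits a plain wide load or store.
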
static ScalarEpilogueLowering
getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      (F->hasOptSize() ||
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
    SEL = CM_ScalarEpilogueNotAllowedOptSize;
  else if (PreferPredicateOverEpilog || Hints.getPredicate())
    SEL = CM_ScalarEpilogueNotNeededUsePredicate;

  return SEL;
}

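// Illustrative sketch: with -Os (or a loop header that profile data marks as
// cold) and no forced 'vectorize(enable)' hint, the result is
// CM_ScalarEpilogueNotAllowedOptSize; with the PreferPredicateOverEpilog
// option or Hints.getPredicate() set it is
// CM_ScalarEpilogueNotNeededUsePredicate, which later drives tail folding by
// masking instead of emitting a scalar remainder loop.
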
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);
  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

7867 // Process each loop nest in the function.
7871 PreservedAnalyses
LoopVectorizePass::run(Function
&F
,
7872 FunctionAnalysisManager
&AM
) {
7873 auto &SE
= AM
.getResult
<ScalarEvolutionAnalysis
>(F
);
7874 auto &LI
= AM
.getResult
<LoopAnalysis
>(F
);
7875 auto &TTI
= AM
.getResult
<TargetIRAnalysis
>(F
);
7876 auto &DT
= AM
.getResult
<DominatorTreeAnalysis
>(F
);
7877 auto &BFI
= AM
.getResult
<BlockFrequencyAnalysis
>(F
);
7878 auto &TLI
= AM
.getResult
<TargetLibraryAnalysis
>(F
);
7879 auto &AA
= AM
.getResult
<AAManager
>(F
);
7880 auto &AC
= AM
.getResult
<AssumptionAnalysis
>(F
);
7881 auto &DB
= AM
.getResult
<DemandedBitsAnalysis
>(F
);
7882 auto &ORE
= AM
.getResult
<OptimizationRemarkEmitterAnalysis
>(F
);
7883 MemorySSA
*MSSA
= EnableMSSALoopDependency
7884 ? &AM
.getResult
<MemorySSAAnalysis
>(F
).getMSSA()
7887 auto &LAM
= AM
.getResult
<LoopAnalysisManagerFunctionProxy
>(F
).getManager();
7888 std::function
<const LoopAccessInfo
&(Loop
&)> GetLAA
=
7889 [&](Loop
&L
) -> const LoopAccessInfo
& {
7890 LoopStandardAnalysisResults AR
= {AA
, AC
, DT
, LI
, SE
, TLI
, TTI
, MSSA
};
7891 return LAM
.getResult
<LoopAccessAnalysis
>(L
, AR
);
7893 const ModuleAnalysisManager
&MAM
=
7894 AM
.getResult
<ModuleAnalysisManagerFunctionProxy
>(F
).getManager();
7895 ProfileSummaryInfo
*PSI
=
7896 MAM
.getCachedResult
<ProfileSummaryAnalysis
>(*F
.getParent());
7898 runImpl(F
, SE
, LI
, TTI
, DT
, BFI
, &TLI
, DB
, AA
, AC
, GetLAA
, ORE
, PSI
);
7900 return PreservedAnalyses::all();
7901 PreservedAnalyses PA
;
7903 // We currently do not preserve loopinfo/dominator analyses with outer loop
7904 // vectorization. Until this is addressed, mark these analyses as preserved
7905 // only for non-VPlan-native path.
7906 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7907 if (!EnableVPlanNativePath
) {
7908 PA
.preserve
<LoopAnalysis
>();
7909 PA
.preserve
<DominatorTreeAnalysis
>();
7911 PA
.preserve
<BasicAA
>();
7912 PA
.preserve
<GlobalsAA
>();