1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to assess the profitability of vectorization.
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
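//
// For illustration (a simplified pseudocode sketch, not code taken from this
// pass), a scalar loop such as:
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is rewritten for a vectorization factor of 4 so that each iteration
// processes four elements with one wide SIMD operation and the induction
// variable advances by 4:
//
//   for (int i = 0; i < n; i += 4)
//     a[i:i+3] = b[i:i+3] + 42;
//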
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //===----------------------------------------------------------------------===//
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
54 //===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
154 using namespace llvm;
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162 "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164 "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166 "llvm.loop.vectorize.followup_epilogue";
167 /// @}
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176 cl::desc("Loops with a constant trip count that is smaller than this "
177 "value are vectorized only if no scalar iteration overheads "
178 "are incurred."));
180 // Indicates that an epilogue is undesired and that predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184 "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185 cl::desc("Indicate that an epilogue is undesired, predication should be "
186 "used instead."));
188 static cl::opt<bool> MaximizeBandwidth(
189 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190 cl::desc("Maximize bandwidth when selecting vectorization factor which "
191 "will be determined by the smallest type in loop."));
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
207 static cl::opt<unsigned> ForceTargetNumScalarRegs(
208 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209 cl::desc("A flag that overrides the target's number of scalar registers."));
211 static cl::opt<unsigned> ForceTargetNumVectorRegs(
212 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213 cl::desc("A flag that overrides the target's number of vector registers."));
215 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
216 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217 cl::desc("A flag that overrides the target's max interleave factor for "
218 "scalar loops."));
220 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
221 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222 cl::desc("A flag that overrides the target's max interleave factor for "
223 "vectorized loops."));
225 static cl::opt<unsigned> ForceTargetInstructionCost(
226 "force-target-instruction-cost", cl::init(0), cl::Hidden,
227 cl::desc("A flag that overrides the target's expected cost for "
228 "an instruction to a single constant value. Mostly "
229 "useful for getting consistent testing."));
231 static cl::opt<unsigned> SmallLoopCost(
232 "small-loop-cost", cl::init(20), cl::Hidden,
233 cl::desc(
234 "The cost of a loop that is considered 'small' by the interleaver."));
236 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
237 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238 cl::desc("Enable the use of the block frequency analysis to access PGO "
239 "heuristics minimizing code growth in cold regions and being more "
240 "aggressive in hot regions."));
242 // Runtime interleave loops for load/store throughput.
243 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
244 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245 cl::desc(
246 "Enable runtime interleaving until load/store ports are saturated"));
248 /// The number of stores in a loop that are allowed to need predication.
249 static cl::opt<unsigned> NumberOfStoresToPredicate(
250 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251 cl::desc("Max number of stores to be predicated behind an if."));
253 static cl::opt<bool> EnableIndVarRegisterHeur(
254 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255 cl::desc("Count the induction variable only once when interleaving"));
257 static cl::opt<bool> EnableCondStoresVectorization(
258 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259 cl::desc("Enable if predication of stores during vectorization."));
261 static cl::opt<unsigned> MaxNestedScalarReductionIC(
262 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263 cl::desc("The maximum interleave count to use when interleaving a scalar "
264 "reduction in a nested loop."));
266 cl::opt<bool> EnableVPlanNativePath(
267 "enable-vplan-native-path", cl::init(false), cl::Hidden,
268 cl::desc("Enable VPlan-native vectorization path with "
269 "support for outer loop vectorization."));
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
273 cl::opt<bool> EnableVPlanPredication(
274 "enable-vplan-predication", cl::init(false), cl::Hidden,
275 cl::desc("Enable VPlan-native vectorization path predicator with "
276 "support for outer loop vectorization."));
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
279 // VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283 "vplan-build-stress-test", cl::init(false), cl::Hidden,
284 cl::desc(
285 "Build VPlan for every supported loop nest in the function and bail "
286 "out right after the build (stress test the VPlan H-CFG construction "
287 "in the VPlan-native vectorization path)."));
289 cl::opt<bool> llvm::EnableLoopInterleaving(
290 "interleave-loops", cl::init(true), cl::Hidden,
291 cl::desc("Enable loop interleaving in Loop vectorization passes"));
292 cl::opt<bool> llvm::EnableLoopVectorization(
293 "vectorize-loops", cl::init(true), cl::Hidden,
294 cl::desc("Run the Loop vectorization passes"));
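// As a usage sketch (assuming a recent `opt` built with this pass; the exact
// pass-name spelling may vary between pass managers), the flags declared above
// can be passed on the opt command line, e.g.:
//
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 \
//       -enable-interleaved-mem-accesses -S input.ll -o vectorized.ll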
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
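/// For example, ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1)
/// and ToVectorTy(void, 4) return the type unchanged (an illustration of the
/// rule above, not additional behavior).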
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300 if (Scalar->isVoidTy() || VF == 1)
301 return Scalar;
302 return VectorType::get(Scalar, VF);
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308 "Expected Load or Store instruction");
309 if (auto *LI = dyn_cast<LoadInst>(I))
310 return LI->getType();
311 return cast<StoreInst>(I)->getValueOperand()->getType();
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
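/// For example (an illustrative case, not an exhaustive list): x86_fp80 holds
/// 80 bits of data but is padded to a larger alloc size (e.g. 128 bits), so it
/// is irregular for VF == 1, and the padded scalars are likewise not
/// bitcast-compatible with a tightly packed <VF x x86_fp80> vector for VF > 1.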
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318 // Determine if an array of VF elements of type Ty is "bitcast compatible"
319 // with a <VF x Ty> vector.
320 if (VF > 1) {
321 auto *VectorTy = VectorType::get(Ty, VF);
322 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
325 // If the vectorization factor is one, we just check if an array of type Ty
326 // requires padding between elements.
327 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
334 /// TODO: We should use actual block probability here, if available. Currently,
335 /// we always assume predicated blocks have a 50% chance of executing.
336 static unsigned getReciprocalPredBlockProb() { return 2; }
338 /// A helper function that adds a 'fast' flag to floating-point operations.
339 static Value *addFastMathFlag(Value *V) {
340 if (isa<FPMathOperator>(V))
341 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342 return V;
345 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
346 if (isa<FPMathOperator>(V))
347 cast<Instruction>(V)->setFastMathFlags(FMF);
348 return V;
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355 : ConstantFP::get(Ty, C);
358 namespace llvm {
360 /// InnerLoopVectorizer vectorizes loops which contain only one basic
361 /// block to a specified vectorization factor (VF).
362 /// This class performs the widening of scalars into vectors, or multiple
363 /// scalars. This class also implements the following features:
364 /// * It inserts an epilogue loop for handling loops that don't have iteration
365 /// counts that are known to be a multiple of the vectorization factor.
366 /// * It handles the code generation for reduction variables.
367 /// * Scalarization (implementation using scalars) of un-vectorizable
368 /// instructions.
369 /// InnerLoopVectorizer does not perform any vectorization-legality
370 /// checks, and relies on the caller to check for the different legality
371 /// aspects. The InnerLoopVectorizer relies on the
372 /// LoopVectorizationLegality class to provide information about the induction
373 /// and reduction variables that were found, for a given vectorization factor.
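///
/// A rough sketch of the control flow this class creates (simplified; the
/// actual blocks and edges are produced by createVectorizedLoopSkeleton):
///
///   entry checks --(fail)--> scalar preheader --> scalar (epilogue) loop
///   entry checks --(pass)--> vector preheader --> vector loop body
///   vector loop body --> middle block --> exit, or --> scalar preheader to
///   run the remaining iterations in the scalar epilogue loop.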
374 class InnerLoopVectorizer {
375 public:
376 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
377 LoopInfo *LI, DominatorTree *DT,
378 const TargetLibraryInfo *TLI,
379 const TargetTransformInfo *TTI, AssumptionCache *AC,
380 OptimizationRemarkEmitter *ORE, unsigned VecWidth,
381 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
382 LoopVectorizationCostModel *CM)
383 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
384 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
385 Builder(PSE.getSE()->getContext()),
386 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
387 virtual ~InnerLoopVectorizer() = default;
389 /// Create a new empty loop. Unlink the old loop and connect the new one.
390 /// Return the pre-header block of the new loop.
391 BasicBlock *createVectorizedLoopSkeleton();
393 /// Widen a single instruction within the innermost loop.
394 void widenInstruction(Instruction &I);
396 /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
397 void fixVectorizedLoop();
399 // Return true if any runtime check is added.
400 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
402 /// A type for vectorized values in the new loop. Each value from the
403 /// original loop, when vectorized, is represented by UF vector values in the
404 /// new unrolled loop, where UF is the unroll factor.
405 using VectorParts = SmallVector<Value *, 2>;
407 /// Vectorize a single PHINode in a block. This method handles the induction
408 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
409 /// arbitrary length vectors.
410 void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
412 /// A helper function to scalarize a single Instruction in the innermost loop.
413 /// Generates a sequence of scalar instances for each lane between \p MinLane
414 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
415 /// inclusive.
416 void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
417 bool IfPredicateInstr);
419 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
420 /// is provided, the integer induction variable will first be truncated to
421 /// the corresponding type.
422 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
424 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
425 /// vector or scalar value on-demand if one is not yet available. When
426 /// vectorizing a loop, we visit the definition of an instruction before its
427 /// uses. When visiting the definition, we either vectorize or scalarize the
428 /// instruction, creating an entry for it in the corresponding map. (In some
429 /// cases, such as induction variables, we will create both vector and scalar
430 /// entries.) Then, as we encounter uses of the definition, we derive values
431 /// for each scalar or vector use unless such a value is already available.
432 /// For example, if we scalarize a definition and one of its uses is vector,
433 /// we build the required vector on-demand with an insertelement sequence
434 /// when visiting the use. Otherwise, if the use is scalar, we can use the
435 /// existing scalar definition.
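///
/// As a concrete illustration (simplified IR with hypothetical value names,
/// VF = 4, UF = 1): if %d was scalarized into %d.0 .. %d.3 and a later
/// instruction needs it as a vector, the vector is built on demand with an
/// insertelement chain, roughly:
///
///   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
///   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
///   %v.2 = insertelement <4 x i32> %v.1,  i32 %d.2, i32 2
///   %v.3 = insertelement <4 x i32> %v.2,  i32 %d.3, i32 3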
437 /// Return a value in the new loop corresponding to \p V from the original
438 /// loop at unroll index \p Part. If the value has already been vectorized,
439 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
440 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
441 /// a new vector value on-demand by inserting the scalar values into a vector
442 /// with an insertelement sequence. If the value has been neither vectorized
443 /// nor scalarized, it must be loop invariant, so we simply broadcast the
444 /// value into a vector.
445 Value *getOrCreateVectorValue(Value *V, unsigned Part);
447 /// Return a value in the new loop corresponding to \p V from the original
448 /// loop at unroll and vector indices \p Instance. If the value has been
449 /// vectorized but not scalarized, the necessary extractelement instruction
450 /// will be generated.
451 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
453 /// Construct the vector value of a scalarized value \p V one lane at a time.
454 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
456 /// Try to vectorize the interleaved access group that \p Instr belongs to,
457 /// optionally masking the vector operations if \p BlockInMask is non-null.
458 void vectorizeInterleaveGroup(Instruction *Instr,
459 VectorParts *BlockInMask = nullptr);
461 /// Vectorize Load and Store instructions, optionally masking the vector
462 /// operations if \p BlockInMask is non-null.
463 void vectorizeMemoryInstruction(Instruction *Instr,
464 VectorParts *BlockInMask = nullptr);
466 /// Set the debug location in the builder using the debug location in
467 /// the instruction.
468 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
470 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
471 void fixNonInductionPHIs(void);
473 protected:
474 friend class LoopVectorizationPlanner;
476 /// A small list of PHINodes.
477 using PhiVector = SmallVector<PHINode *, 4>;
479 /// A type for scalarized values in the new loop. Each value from the
480 /// original loop, when scalarized, is represented by UF x VF scalar values
481 /// in the new unrolled loop, where UF is the unroll factor and VF is the
482 /// vectorization factor.
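/// For example (an illustrative layout only): with UF = 2 and VF = 4 a
/// scalarized value occupies 2 x 4 = 8 scalar slots, addressed by a
/// (part, lane) pair such as (1, 2) for the third lane of the second part.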
483 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
485 /// Set up the values of the IVs correctly when exiting the vector loop.
486 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
487 Value *CountRoundDown, Value *EndValue,
488 BasicBlock *MiddleBlock);
490 /// Create a new induction variable inside L.
491 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
492 Value *Step, Instruction *DL);
494 /// Handle all cross-iteration phis in the header.
495 void fixCrossIterationPHIs();
497 /// Fix a first-order recurrence. This is the second phase of vectorizing
498 /// this phi node.
499 void fixFirstOrderRecurrence(PHINode *Phi);
501 /// Fix a reduction cross-iteration phi. This is the second phase of
502 /// vectorizing this phi node.
503 void fixReduction(PHINode *Phi);
505 /// The Loop exit block may have single value PHI nodes with some
506 /// incoming value. While vectorizing we only handled real values
507 /// that were defined inside the loop and we should have one value for
508 /// each predecessor of its parent basic block. See PR14725.
509 void fixLCSSAPHIs();
511 /// Iteratively sink the scalarized operands of a predicated instruction into
512 /// the block that was created for it.
513 void sinkScalarOperands(Instruction *PredInst);
515 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
516 /// represented as.
517 void truncateToMinimalBitwidths();
519 /// Insert the new loop to the loop hierarchy and pass manager
520 /// and update the analysis passes.
521 void updateAnalysis();
523 /// Create a broadcast instruction. This method generates a broadcast
524 /// instruction (shuffle) for loop invariant values and for the induction
525 /// value. If this is the induction variable then we extend it to N, N+1, ...
526 /// this is needed because each iteration in the loop corresponds to a SIMD
527 /// element.
528 virtual Value *getBroadcastInstrs(Value *V);
530 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
531 /// to each vector element of Val. The sequence starts at StartIdx.
532 /// \p Opcode is relevant for FP induction variable.
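/// For example (an illustrative case, VF = 4): with StartIdx = 0 and step %s,
/// the vector <0, %s, 2*%s, 3*%s> is added lane-wise to Val (typically a
/// broadcast of the scalar induction value), producing the per-lane values of
/// an induction with stride %s.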
533 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
534 Instruction::BinaryOps Opcode =
535 Instruction::BinaryOpsEnd);
537 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
538 /// variable on which to base the steps, \p Step is the size of the step, and
539 /// \p EntryVal is the value from the original loop that maps to the steps.
540 /// Note that \p EntryVal doesn't have to be an induction variable - it
541 /// can also be a truncate instruction.
542 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
543 const InductionDescriptor &ID);
545 /// Create a vector induction phi node based on an existing scalar one. \p
546 /// EntryVal is the value from the original loop that maps to the vector phi
547 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
548 /// truncate instruction, instead of widening the original IV, we widen a
549 /// version of the IV truncated to \p EntryVal's type.
550 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
551 Value *Step, Instruction *EntryVal);
553 /// Returns true if an instruction \p I should be scalarized instead of
554 /// vectorized for the chosen vectorization factor.
555 bool shouldScalarizeInstruction(Instruction *I) const;
557 /// Returns true if we should generate a scalar version of \p IV.
558 bool needsScalarInduction(Instruction *IV) const;
560 /// If there is a cast involved in the induction variable \p ID, which should
561 /// be ignored in the vectorized loop body, this function records the
562 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
563 /// cast. We had already proved that the casted Phi is equal to the uncasted
564 /// Phi in the vectorized loop (under a runtime guard), and therefore
565 /// there is no need to vectorize the cast - the same value can be used in the
566 /// vector loop for both the Phi and the cast.
567 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
568 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
570 /// \p EntryVal is the value from the original loop that maps to the vector
571 /// phi node and is used to distinguish what is the IV currently being
572 /// processed - original one (if \p EntryVal is a phi corresponding to the
573 /// original IV) or the "newly-created" one based on the proof mentioned above
574 /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
575 /// latter case \p EntryVal is a TruncInst and we must not record anything for
576 /// that IV, but it's error-prone to expect callers of this routine to care
577 /// about that, hence this explicit parameter.
578 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
579 const Instruction *EntryVal,
580 Value *VectorLoopValue,
581 unsigned Part,
582 unsigned Lane = UINT_MAX);
584 /// Generate a shuffle sequence that will reverse the vector Vec.
585 virtual Value *reverseVector(Value *Vec);
587 /// Returns (and creates if needed) the original loop trip count.
588 Value *getOrCreateTripCount(Loop *NewLoop);
590 /// Returns (and creates if needed) the trip count of the widened loop.
591 Value *getOrCreateVectorTripCount(Loop *NewLoop);
593 /// Returns a bitcasted value to the requested vector type.
594 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
595 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
596 const DataLayout &DL);
598 /// Emit a bypass check to see if the vector trip count is zero, including if
599 /// it overflows.
600 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
602 /// Emit a bypass check to see if all of the SCEV assumptions we've
603 /// had to make are correct.
604 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
606 /// Emit bypass checks to check any memory assumptions we may have made.
607 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
609 /// Compute the transformed value of Index at offset StartValue using step
610 /// StepValue.
611 /// For integer induction, returns StartValue + Index * StepValue.
612 /// For pointer induction, returns StartValue[Index * StepValue].
613 /// FIXME: The newly created binary instructions should contain nsw/nuw
614 /// flags, which can be found from the original scalar operations.
615 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
616 const DataLayout &DL,
617 const InductionDescriptor &ID) const;
619 /// Add additional metadata to \p To that was not present on \p Orig.
621 /// Currently this is used to add the noalias annotations based on the
622 /// inserted memchecks. Use this for instructions that are *cloned* into the
623 /// vector loop.
624 void addNewMetadata(Instruction *To, const Instruction *Orig);
626 /// Add metadata from one instruction to another.
628 /// This includes both the original MDs from \p From and additional ones (\see
629 /// addNewMetadata). Use this for *newly created* instructions in the vector
630 /// loop.
631 void addMetadata(Instruction *To, Instruction *From);
633 /// Similar to the previous function but it adds the metadata to a
634 /// vector of instructions.
635 void addMetadata(ArrayRef<Value *> To, Instruction *From);
637 /// The original loop.
638 Loop *OrigLoop;
640 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
641 /// dynamic knowledge to simplify SCEV expressions and converts them to a
642 /// more usable form.
643 PredicatedScalarEvolution &PSE;
645 /// Loop Info.
646 LoopInfo *LI;
648 /// Dominator Tree.
649 DominatorTree *DT;
651 /// Alias Analysis.
652 AliasAnalysis *AA;
654 /// Target Library Info.
655 const TargetLibraryInfo *TLI;
657 /// Target Transform Info.
658 const TargetTransformInfo *TTI;
660 /// Assumption Cache.
661 AssumptionCache *AC;
663 /// Interface to emit optimization remarks.
664 OptimizationRemarkEmitter *ORE;
666 /// LoopVersioning. It's only set up (non-null) if memchecks were
667 /// used.
669 /// This is currently only used to add no-alias metadata based on the
670 /// memchecks. The actual versioning is performed manually.
671 std::unique_ptr<LoopVersioning> LVer;
673 /// The vectorization SIMD factor to use. Each vector will have this many
674 /// vector elements.
675 unsigned VF;
677 /// The vectorization unroll factor to use. Each scalar is vectorized to this
678 /// many different vector instructions.
679 unsigned UF;
681 /// The builder that we use
682 IRBuilder<> Builder;
684 // --- Vectorization state ---
686 /// The vector-loop preheader.
687 BasicBlock *LoopVectorPreHeader;
689 /// The scalar-loop preheader.
690 BasicBlock *LoopScalarPreHeader;
692 /// Middle Block between the vector and the scalar.
693 BasicBlock *LoopMiddleBlock;
695 /// The ExitBlock of the scalar loop.
696 BasicBlock *LoopExitBlock;
698 /// The vector loop body.
699 BasicBlock *LoopVectorBody;
701 /// The scalar loop body.
702 BasicBlock *LoopScalarBody;
704 /// A list of all bypass blocks. The first block is the entry of the loop.
705 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
707 /// The new Induction variable which was added to the new block.
708 PHINode *Induction = nullptr;
710 /// The induction variable of the old basic block.
711 PHINode *OldInduction = nullptr;
713 /// Maps values from the original loop to their corresponding values in the
714 /// vectorized loop. A key value can map to either vector values, scalar
715 /// values or both kinds of values, depending on whether the key was
716 /// vectorized and scalarized.
717 VectorizerValueMap VectorLoopValueMap;
719 /// Store instructions that were predicated.
720 SmallVector<Instruction *, 4> PredicatedInstructions;
722 /// Trip count of the original loop.
723 Value *TripCount = nullptr;
725 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
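/// (Illustrative arithmetic: TripCount = 13, VF = 4, UF = 2 gives
/// VectorTripCount = 13 - 13 % 8 = 8, so 8 iterations run in the vector loop
/// and the remaining 5 run in the scalar epilogue.)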
726 Value *VectorTripCount = nullptr;
728 /// The legality analysis.
729 LoopVectorizationLegality *Legal;
731 /// The profitability analysis.
732 LoopVectorizationCostModel *Cost;
734 // Record whether runtime checks are added.
735 bool AddedSafetyChecks = false;
737 // Holds the end values for each induction variable. We save the end values
738 // so we can later fix-up the external users of the induction variables.
739 DenseMap<PHINode *, Value *> IVEndValues;
741 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
742 // fixed up at the end of vector code generation.
743 SmallVector<PHINode *, 8> OrigPHIsToFix;
746 class InnerLoopUnroller : public InnerLoopVectorizer {
747 public:
748 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
749 LoopInfo *LI, DominatorTree *DT,
750 const TargetLibraryInfo *TLI,
751 const TargetTransformInfo *TTI, AssumptionCache *AC,
752 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
753 LoopVectorizationLegality *LVL,
754 LoopVectorizationCostModel *CM)
755 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
756 UnrollFactor, LVL, CM) {}
758 private:
759 Value *getBroadcastInstrs(Value *V) override;
760 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
761 Instruction::BinaryOps Opcode =
762 Instruction::BinaryOpsEnd) override;
763 Value *reverseVector(Value *Vec) override;
766 } // end namespace llvm
768 /// Look for a meaningful debug location on the instruction or its
769 /// operands.
770 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
771 if (!I)
772 return I;
774 DebugLoc Empty;
775 if (I->getDebugLoc() != Empty)
776 return I;
778 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
779 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
780 if (OpInst->getDebugLoc() != Empty)
781 return OpInst;
784 return I;
787 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
788 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
789 const DILocation *DIL = Inst->getDebugLoc();
790 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
791 !isa<DbgInfoIntrinsic>(Inst)) {
792 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
793 if (NewDIL)
794 B.SetCurrentDebugLocation(NewDIL.getValue());
795 else
796 LLVM_DEBUG(dbgs()
797 << "Failed to create new discriminator: "
798 << DIL->getFilename() << " Line: " << DIL->getLine());
800 else
801 B.SetCurrentDebugLocation(DIL);
802 } else
803 B.SetCurrentDebugLocation(DebugLoc());
806 /// Write a record \p DebugMsg about vectorization failure to the debug
807 /// output stream. If \p I is passed, it is an instruction that prevents
808 /// vectorization.
809 #ifndef NDEBUG
810 static void debugVectorizationFailure(const StringRef DebugMsg,
811 Instruction *I) {
812 dbgs() << "LV: Not vectorizing: " << DebugMsg;
813 if (I != nullptr)
814 dbgs() << " " << *I;
815 else
816 dbgs() << '.';
817 dbgs() << '\n';
819 #endif
821 /// Create an analysis remark that explains why vectorization failed
823 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
824 /// RemarkName is the identifier for the remark. If \p I is passed it is an
825 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
826 /// the location of the remark. \return the remark object that can be
827 /// streamed to.
828 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
829 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
830 Value *CodeRegion = TheLoop->getHeader();
831 DebugLoc DL = TheLoop->getStartLoc();
833 if (I) {
834 CodeRegion = I->getParent();
835 // If there is no debug location attached to the instruction, fall back to
836 // using the loop's.
837 if (I->getDebugLoc())
838 DL = I->getDebugLoc();
841 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
842 R << "loop not vectorized: ";
843 return R;
846 namespace llvm {
848 void reportVectorizationFailure(const StringRef DebugMsg,
849 const StringRef OREMsg, const StringRef ORETag,
850 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
851 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
852 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
853 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
854 ORETag, TheLoop, I) << OREMsg);
857 } // end namespace llvm
859 #ifndef NDEBUG
860 /// \return string containing a file name and a line # for the given loop.
861 static std::string getDebugLocString(const Loop *L) {
862 std::string Result;
863 if (L) {
864 raw_string_ostream OS(Result);
865 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
866 LoopDbgLoc.print(OS);
867 else
868 // Just print the module name.
869 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
870 OS.flush();
872 return Result;
874 #endif
876 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
877 const Instruction *Orig) {
878 // If the loop was versioned with memchecks, add the corresponding no-alias
879 // metadata.
880 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
881 LVer->annotateInstWithNoAlias(To, Orig);
884 void InnerLoopVectorizer::addMetadata(Instruction *To,
885 Instruction *From) {
886 propagateMetadata(To, From);
887 addNewMetadata(To, From);
890 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
891 Instruction *From) {
892 for (Value *V : To) {
893 if (Instruction *I = dyn_cast<Instruction>(V))
894 addMetadata(I, From);
898 namespace llvm {
900 // Loop vectorization cost-model hints how the scalar epilogue loop should be
901 // lowered.
902 enum ScalarEpilogueLowering {
904 // The default: allowing scalar epilogues.
905 CM_ScalarEpilogueAllowed,
907 // Vectorization with OptForSize: don't allow epilogues.
908 CM_ScalarEpilogueNotAllowedOptSize,
910 // A special case of vectorization with OptForSize: loops with a very small
911 // trip count are considered for vectorization under OptForSize, thereby
912 // making sure the cost of their loop body is dominant, free of runtime
913 // guards and scalar iteration overheads.
914 CM_ScalarEpilogueNotAllowedLowTripLoop,
916 // Loop hint predicate indicating an epilogue is undesired.
917 CM_ScalarEpilogueNotNeededUsePredicate
920 /// LoopVectorizationCostModel - estimates the expected speedups due to
921 /// vectorization.
922 /// In many cases vectorization is not profitable. This can happen for
923 /// a number of reasons. In this class we mainly attempt to predict the
924 /// expected speedup/slowdowns due to the supported instruction set. We use the
925 /// TargetTransformInfo to query the different backends for the cost of
926 /// different operations.
927 class LoopVectorizationCostModel {
928 public:
929 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
930 PredicatedScalarEvolution &PSE, LoopInfo *LI,
931 LoopVectorizationLegality *Legal,
932 const TargetTransformInfo &TTI,
933 const TargetLibraryInfo *TLI, DemandedBits *DB,
934 AssumptionCache *AC,
935 OptimizationRemarkEmitter *ORE, const Function *F,
936 const LoopVectorizeHints *Hints,
937 InterleavedAccessInfo &IAI)
938 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
939 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
940 Hints(Hints), InterleaveInfo(IAI) {}
942 /// \return An upper bound for the vectorization factor, or None if
943 /// vectorization and interleaving should be avoided up front.
944 Optional<unsigned> computeMaxVF();
946 /// \return True if runtime checks are required for vectorization, and false
947 /// otherwise.
948 bool runtimeChecksRequired();
950 /// \return The most profitable vectorization factor and the cost of that VF.
951 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
952 /// then this vectorization factor will be selected if vectorization is
953 /// possible.
954 VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
956 /// Setup cost-based decisions for user vectorization factor.
957 void selectUserVectorizationFactor(unsigned UserVF) {
958 collectUniformsAndScalars(UserVF);
959 collectInstsToScalarize(UserVF);
962 /// \return The size (in bits) of the smallest and widest types in the code
963 /// that needs to be vectorized. We ignore values that remain scalar such as
964 /// 64 bit loop indices.
965 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
967 /// \return The desired interleave count.
968 /// If interleave count has been specified by metadata it will be returned.
969 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
970 /// are the selected vectorization factor and the cost of the selected VF.
971 unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
973 /// A memory access instruction may be vectorized in more than one way.
974 /// The form of the instruction after vectorization depends on its cost.
975 /// This function takes cost-based decisions for Load/Store instructions
976 /// and collects them in a map. This decision map is used for building
977 /// the lists of loop-uniform and loop-scalar instructions.
978 /// The calculated cost is saved with the widening decision in order to
979 /// avoid redundant calculations.
980 void setCostBasedWideningDecision(unsigned VF);
982 /// A struct that represents some properties of the register usage
983 /// of a loop.
984 struct RegisterUsage {
985 /// Holds the number of loop invariant values that are used in the loop.
986 unsigned LoopInvariantRegs;
988 /// Holds the maximum number of concurrent live intervals in the loop.
989 unsigned MaxLocalUsers;
992 /// \return Returns information about the register usages of the loop for the
993 /// given vectorization factors.
994 SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
996 /// Collect values we want to ignore in the cost model.
997 void collectValuesToIgnore();
999 /// \returns The smallest bitwidth each instruction can be represented with.
1000 /// The vector equivalents of these instructions should be truncated to this
1001 /// type.
1002 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1003 return MinBWs;
1006 /// \returns True if it is more profitable to scalarize instruction \p I for
1007 /// vectorization factor \p VF.
1008 bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1009 assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1011 // Cost model is not run in the VPlan-native path - return conservative
1012 // result until this changes.
1013 if (EnableVPlanNativePath)
1014 return false;
1016 auto Scalars = InstsToScalarize.find(VF);
1017 assert(Scalars != InstsToScalarize.end() &&
1018 "VF not yet analyzed for scalarization profitability");
1019 return Scalars->second.find(I) != Scalars->second.end();
1022 /// Returns true if \p I is known to be uniform after vectorization.
1023 bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1024 if (VF == 1)
1025 return true;
1027 // Cost model is not run in the VPlan-native path - return conservative
1028 // result until this changes.
1029 if (EnableVPlanNativePath)
1030 return false;
1032 auto UniformsPerVF = Uniforms.find(VF);
1033 assert(UniformsPerVF != Uniforms.end() &&
1034 "VF not yet analyzed for uniformity");
1035 return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1038 /// Returns true if \p I is known to be scalar after vectorization.
1039 bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1040 if (VF == 1)
1041 return true;
1043 // Cost model is not run in the VPlan-native path - return conservative
1044 // result until this changes.
1045 if (EnableVPlanNativePath)
1046 return false;
1048 auto ScalarsPerVF = Scalars.find(VF);
1049 assert(ScalarsPerVF != Scalars.end() &&
1050 "Scalar values are not calculated for VF");
1051 return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1054 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1055 /// for vectorization factor \p VF.
1056 bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1057 return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1058 !isProfitableToScalarize(I, VF) &&
1059 !isScalarAfterVectorization(I, VF);
1062 /// Decision that was taken during cost calculation for memory instruction.
1063 enum InstWidening {
1064 CM_Unknown,
1065 CM_Widen, // For consecutive accesses with stride +1.
1066 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1067 CM_Interleave,
1068 CM_GatherScatter,
1069 CM_Scalarize
1072 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073 /// instruction \p I and vector width \p VF.
1074 void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1075 unsigned Cost) {
1076 assert(VF >= 2 && "Expected VF >=2");
1077 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1080 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1081 /// interleaving group \p Grp and vector width \p VF.
1082 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1083 InstWidening W, unsigned Cost) {
1084 assert(VF >= 2 && "Expected VF >=2");
1085 /// Broadcast this decision to all instructions inside the group.
1086 /// But the cost will be assigned to one instruction only.
1087 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1088 if (auto *I = Grp->getMember(i)) {
1089 if (Grp->getInsertPos() == I)
1090 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1091 else
1092 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1097 /// Return the cost model decision for the given instruction \p I and vector
1098 /// width \p VF. Return CM_Unknown if this instruction did not pass
1099 /// through the cost modeling.
1100 InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1101 assert(VF >= 2 && "Expected VF >=2");
1103 // Cost model is not run in the VPlan-native path - return conservative
1104 // result until this changes.
1105 if (EnableVPlanNativePath)
1106 return CM_GatherScatter;
1108 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1109 auto Itr = WideningDecisions.find(InstOnVF);
1110 if (Itr == WideningDecisions.end())
1111 return CM_Unknown;
1112 return Itr->second.first;
1115 /// Return the vectorization cost for the given instruction \p I and vector
1116 /// width \p VF.
1117 unsigned getWideningCost(Instruction *I, unsigned VF) {
1118 assert(VF >= 2 && "Expected VF >=2");
1119 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1120 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1121 "The cost is not calculated");
1122 return WideningDecisions[InstOnVF].second;
1125 /// Return True if instruction \p I is an optimizable truncate whose operand
1126 /// is an induction variable. Such a truncate will be removed by adding a new
1127 /// induction variable with the destination type.
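/// For example (a hypothetical snippet): for a loop with an i64 induction
/// %iv, a use such as
///   %t = trunc i64 %iv to i32
/// can be optimized away by introducing a new i32 induction variable and
/// replacing the truncate with it.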
1128 bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1129 // If the instruction is not a truncate, return false.
1130 auto *Trunc = dyn_cast<TruncInst>(I);
1131 if (!Trunc)
1132 return false;
1134 // Get the source and destination types of the truncate.
1135 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1136 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1138 // If the truncate is free for the given types, return false. Replacing a
1139 // free truncate with an induction variable would add an induction variable
1140 // update instruction to each iteration of the loop. We exclude from this
1141 // check the primary induction variable since it will need an update
1142 // instruction regardless.
1143 Value *Op = Trunc->getOperand(0);
1144 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145 return false;
1147 // If the truncated value is not an induction variable, return false.
1148 return Legal->isInductionPhi(Op);
1151 /// Collects the instructions to scalarize for each predicated instruction in
1152 /// the loop.
1153 void collectInstsToScalarize(unsigned VF);
1155 /// Collect Uniform and Scalar values for the given \p VF.
1156 /// The sets depend on CM decision for Load/Store instructions
1157 /// that may be vectorized as interleave, gather-scatter or scalarized.
1158 void collectUniformsAndScalars(unsigned VF) {
1159 // Do the analysis once.
1160 if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1161 return;
1162 setCostBasedWideningDecision(VF);
1163 collectLoopUniforms(VF);
1164 collectLoopScalars(VF);
1167 /// Returns true if the target machine supports masked store operation
1168 /// for the given \p DataType and kind of access to \p Ptr.
1169 bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1170 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1173 /// Returns true if the target machine supports masked load operation
1174 /// for the given \p DataType and kind of access to \p Ptr.
1175 bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1176 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1179 /// Returns true if the target machine supports masked scatter operation
1180 /// for the given \p DataType.
1181 bool isLegalMaskedScatter(Type *DataType) {
1182 return TTI.isLegalMaskedScatter(DataType);
1185 /// Returns true if the target machine supports masked gather operation
1186 /// for the given \p DataType.
1187 bool isLegalMaskedGather(Type *DataType) {
1188 return TTI.isLegalMaskedGather(DataType);
1191 /// Returns true if the target machine can represent \p V as a masked gather
1192 /// or scatter operation.
1193 bool isLegalGatherOrScatter(Value *V) {
1194 bool LI = isa<LoadInst>(V);
1195 bool SI = isa<StoreInst>(V);
1196 if (!LI && !SI)
1197 return false;
1198 auto *Ty = getMemInstValueType(V);
1199 return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1202 /// Returns true if \p I is an instruction that will be scalarized with
1203 /// predication. Such instructions include conditional stores and
1204 /// instructions that may divide by zero.
1205 /// If a non-zero VF has been calculated, we check if I will be scalarized
1206 /// with predication for that VF.
1207 bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1209 // Returns true if \p I is an instruction that will be predicated either
1210 // through scalar predication or masked load/store or masked gather/scatter.
1211 // Superset of instructions that return true for isScalarWithPredication.
1212 bool isPredicatedInst(Instruction *I) {
1213 if (!blockNeedsPredication(I->getParent()))
1214 return false;
1215 // Loads and stores that need some form of masked operation are predicated
1216 // instructions.
1217 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1218 return Legal->isMaskRequired(I);
1219 return isScalarWithPredication(I);
1222 /// Returns true if \p I is a memory instruction with consecutive memory
1223 /// access that can be widened.
1224 bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1226 /// Returns true if \p I is a memory instruction in an interleaved-group
1227 /// of memory accesses that can be vectorized with wide vector loads/stores
1228 /// and shuffles.
1229 bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1231 /// Check if \p Instr belongs to any interleaved access group.
1232 bool isAccessInterleaved(Instruction *Instr) {
1233 return InterleaveInfo.isInterleaved(Instr);
1236 /// Get the interleaved access group that \p Instr belongs to.
1237 const InterleaveGroup<Instruction> *
1238 getInterleavedAccessGroup(Instruction *Instr) {
1239 return InterleaveInfo.getInterleaveGroup(Instr);
1242 /// Returns true if an interleaved group requires a scalar iteration
1243 /// to handle accesses with gaps, and there is nothing preventing us from
1244 /// creating a scalar epilogue.
1245 bool requiresScalarEpilogue() const {
1246 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1249 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1250 /// loop hint annotation.
1251 bool isScalarEpilogueAllowed() const {
1252 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1255 /// Returns true if all loop blocks should be masked to fold tail loop.
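/// (Sketch of the idea, not the exact IR: with VF = 4 and trip count n, the
/// vector loop runs ceil(n / 4) iterations and each lane is guarded by a
/// predicate such as <i+0 < n, i+1 < n, i+2 < n, i+3 < n>, so no scalar
/// epilogue is needed for the leftover iterations.)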
1256 bool foldTailByMasking() const { return FoldTailByMasking; }
1258 bool blockNeedsPredication(BasicBlock *BB) {
1259 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1262 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1263 /// with factor VF. Return the cost of the instruction, including
1264 /// scalarization overhead if it's needed.
1265 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1267 /// Estimate cost of a call instruction CI if it were vectorized with factor
1268 /// VF. Return the cost of the instruction, including scalarization overhead
1269 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1270 /// scalarized -
1271 /// i.e. either a vector version isn't available or it is too expensive.
1272 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1274 private:
1275 unsigned NumPredStores = 0;
1277 /// \return An upper bound for the vectorization factor, larger than zero.
1278 /// One is returned if vectorization should best be avoided due to cost.
1279 unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1281 /// The vectorization cost is a combination of the cost itself and a boolean
1282 /// indicating whether any of the contributing operations will actually
1283 /// operate on
1284 /// vector values after type legalization in the backend. If this latter value
1285 /// is
1286 /// false, then all operations will be scalarized (i.e. no vectorization has
1287 /// actually taken place).
1288 using VectorizationCostTy = std::pair<unsigned, bool>;
1290 /// Returns the expected execution cost. The unit of the cost does
1291 /// not matter because we use the 'cost' units to compare different
1292 /// vector widths. The cost that is returned is *not* normalized by
1293 /// the factor width.
1294 VectorizationCostTy expectedCost(unsigned VF);
1296 /// Returns the execution time cost of an instruction for a given vector
1297 /// width. Vector width of one means scalar.
1298 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1300 /// The cost-computation logic from getInstructionCost which provides
1301 /// the vector type as an output parameter.
1302 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1304 /// Calculate vectorization cost of memory instruction \p I.
1305 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1307 /// The cost computation for scalarized memory instruction.
1308 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1310 /// The cost computation for interleaving group of memory instructions.
1311 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1313 /// The cost computation for Gather/Scatter instruction.
1314 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1316 /// The cost computation for widening instruction \p I with consecutive
1317 /// memory access.
1318 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1320 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1321 /// Load: scalar load + broadcast.
1322 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1323 /// element)
1324 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1326 /// Estimate the overhead of scalarizing an instruction. This is a
1327 /// convenience wrapper for the type-based getScalarizationOverhead API.
1328 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1330 /// Returns whether the instruction is a load or store and will be emitted
1331 /// as a vector operation.
1332 bool isConsecutiveLoadOrStore(Instruction *I);
1334 /// Returns true if an artificially high cost for emulated masked memrefs
1335 /// should be used.
1336 bool useEmulatedMaskMemRefHack(Instruction *I);
1338 /// Map of scalar integer values to the smallest bitwidth they can be legally
1339 /// represented as. The vector equivalents of these values should be truncated
1340 /// to this type.
1341 MapVector<Instruction *, uint64_t> MinBWs;
1343 /// A type representing the costs for instructions if they were to be
1344 /// scalarized rather than vectorized. The entries are Instruction-Cost
1345 /// pairs.
1346 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1348 /// A set containing all BasicBlocks that are known to be present after
1349 /// vectorization as predicated blocks.
1350 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1352 /// Records whether it is allowed to have the original scalar loop execute at
1353 /// least once. This may be needed as a fallback loop in case runtime
1354 /// aliasing/dependence checks fail, or to handle the tail/remainder
1355 /// iterations when the trip count is unknown or is not a multiple of the VF,
1356 /// or as a peel-loop to handle gaps in interleave-groups.
1357 /// Under optsize and when the trip count is very small we don't allow any
1358 /// iterations to execute in the scalar loop.
1359 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1361 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1362 bool FoldTailByMasking = false;
1364 /// A map holding scalar costs for different vectorization factors. The
1365 /// presence of a cost for an instruction in the mapping indicates that the
1366 /// instruction will be scalarized when vectorizing with the associated
1367 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1370 /// Holds the instructions known to be uniform after vectorization.
1371 /// The data is collected per VF.
1372 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1374 /// Holds the instructions known to be scalar after vectorization.
1375 /// The data is collected per VF.
1376 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1378 /// Holds the instructions (address computations) that are forced to be
1379 /// scalarized.
1380 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1382 /// Returns the expected difference in cost from scalarizing the expression
1383 /// feeding a predicated instruction \p PredInst. The instructions to
1384 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385 /// non-negative return value implies the expression will be scalarized.
1386 /// Currently, only single-use chains are considered for scalarization.
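/// For example (illustrative), a non-negative discount for a chain of
/// address computations feeding a predicated store means that emitting the
/// chain as VF scalar instructions is expected to be no more expensive than
/// widening it and extracting the lanes the scalarized store needs.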
1387 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388 unsigned VF);
1390 /// Collect the instructions that are uniform after vectorization. An
1391 /// instruction is uniform if we represent it with a single scalar value in
1392 /// the vectorized loop corresponding to each vector iteration. Examples of
1393 /// uniform instructions include pointer operands of consecutive or
1394 /// interleaved memory accesses. Note that although uniformity implies an
1395 /// instruction will be scalar, the reverse is not true. In general, a
1396 /// scalarized instruction will be represented by VF scalar values in the
1397 /// vectorized loop, each corresponding to an iteration of the original
1398 /// scalar loop.
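/// For example, the pointer operand of a consecutive load is uniform: a
/// single scalar address per unroll part serves the whole wide load, whereas
/// a non-uniform scalarized instruction is replicated VF times per part.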
1399 void collectLoopUniforms(unsigned VF);
1401 /// Collect the instructions that are scalar after vectorization. An
1402 /// instruction is scalar if it is known to be uniform or will be scalarized
1403 /// during vectorization. Non-uniform scalarized instructions will be
1404 /// represented by VF values in the vectorized loop, each corresponding to an
1405 /// iteration of the original scalar loop.
1406 void collectLoopScalars(unsigned VF);
1408 /// Keeps cost model vectorization decision and cost for instructions.
1409 /// Right now it is used for memory instructions only.
1410 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1411 std::pair<InstWidening, unsigned>>;
1413 DecisionList WideningDecisions;
1415 /// Returns true if \p V is expected to be vectorized and it needs to be
1416 /// extracted.
1417 bool needsExtract(Value *V, unsigned VF) const {
1418 Instruction *I = dyn_cast<Instruction>(V);
1419 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1420 return false;
1422 // Assume we can vectorize V (and hence we need extraction) if the
1423 // scalars are not computed yet. This can happen, because it is called
1424 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1425 // the scalars are collected. That should be a safe assumption in most
1426 // cases, because we check if the operands have vectorizable types
1427 // beforehand in LoopVectorizationLegality.
1428 return Scalars.find(VF) == Scalars.end() ||
1429 !isScalarAfterVectorization(I, VF);
1432 /// Returns a range containing only operands needing to be extracted.
1433 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1434 unsigned VF) {
1435 return SmallVector<Value *, 4>(make_filter_range(
1436 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1439 public:
1440 /// The loop that we evaluate.
1441 Loop *TheLoop;
1443 /// Predicated scalar evolution analysis.
1444 PredicatedScalarEvolution &PSE;
1446 /// Loop Info analysis.
1447 LoopInfo *LI;
1449 /// Vectorization legality.
1450 LoopVectorizationLegality *Legal;
1452 /// Vector target information.
1453 const TargetTransformInfo &TTI;
1455 /// Target Library Info.
1456 const TargetLibraryInfo *TLI;
1458 /// Demanded bits analysis.
1459 DemandedBits *DB;
1461 /// Assumption cache.
1462 AssumptionCache *AC;
1464 /// Interface to emit optimization remarks.
1465 OptimizationRemarkEmitter *ORE;
1467 const Function *TheFunction;
1469 /// Loop Vectorize Hint.
1470 const LoopVectorizeHints *Hints;
1472 /// The interleave access information contains groups of interleaved accesses
1473 /// with the same stride and close to each other.
1474 InterleavedAccessInfo &InterleaveInfo;
1476 /// Values to ignore in the cost model.
1477 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1479 /// Values to ignore in the cost model when VF > 1.
1480 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1483 } // end namespace llvm
1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1486 // vectorization. The loop needs to be annotated with #pragma omp simd
1487 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1488 // vector length information is not provided, vectorization is not considered
1489 // explicit. Interleave hints are not allowed either. These limitations will be
1490 // relaxed in the future.
1491 // Please note that we are currently forced to abuse the pragma 'clang
1492 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1494 // provides *explicit vectorization hints* (LV can bypass legal checks and
1495 // assume that vectorization is legal). However, both hints are implemented
1496 // using the same metadata (llvm.loop.vectorize, processed by
1497 // LoopVectorizeHints). This will be fixed in the future when the native IR
1498 // representation for pragma 'omp simd' is introduced.
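// For example (illustrative only), an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// over an inner loop qualifies, whereas the same loop without an explicit
// vectorize_width does not.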
1499 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1500 OptimizationRemarkEmitter *ORE) {
1501 assert(!OuterLp->empty() && "This is not an outer loop");
1502 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1504 // Only outer loops with an explicit vectorization hint are supported.
1505 // Unannotated outer loops are ignored.
1506 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1507 return false;
1509 Function *Fn = OuterLp->getHeader()->getParent();
1510 if (!Hints.allowVectorization(Fn, OuterLp,
1511 true /*VectorizeOnlyWhenForced*/)) {
1512 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1513 return false;
1516 if (Hints.getInterleave() > 1) {
1517 // TODO: Interleave support is future work.
1518 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1519 "outer loops.\n");
1520 Hints.emitRemarkWithHints();
1521 return false;
1524 return true;
1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1528 OptimizationRemarkEmitter *ORE,
1529 SmallVectorImpl<Loop *> &V) {
1530 // Collect inner loops and outer loops without irreducible control flow. For
1531 // now, only collect outer loops that have explicit vectorization hints. If we
1532 // are stress testing the VPlan H-CFG construction, we collect the outermost
1533 // loop of every loop nest.
1534 if (L.empty() || VPlanBuildStressTest ||
1535 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1536 LoopBlocksRPO RPOT(&L);
1537 RPOT.perform(LI);
1538 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1539 V.push_back(&L);
1540 // TODO: Collect inner loops inside marked outer loops in case
1541 // vectorization fails for the outer loop. Do not invoke
1542 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1543 // already known to be reducible. We can use an inherited attribute for
1544 // that.
1545 return;
1548 for (Loop *InnerL : L)
1549 collectSupportedLoops(*InnerL, LI, ORE, V);
1552 namespace {
1554 /// The LoopVectorize Pass.
1555 struct LoopVectorize : public FunctionPass {
1556 /// Pass identification, replacement for typeid
1557 static char ID;
1559 LoopVectorizePass Impl;
1561 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1562 bool VectorizeOnlyWhenForced = false)
1563 : FunctionPass(ID) {
1564 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1565 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1566 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1569 bool runOnFunction(Function &F) override {
1570 if (skipFunction(F))
1571 return false;
1573 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1574 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1575 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1576 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1577 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1578 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1579 auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1580 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1581 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1582 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1583 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1584 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1585 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1587 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1588 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1590 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1591 GetLAA, *ORE, PSI);
1594 void getAnalysisUsage(AnalysisUsage &AU) const override {
1595 AU.addRequired<AssumptionCacheTracker>();
1596 AU.addRequired<BlockFrequencyInfoWrapperPass>();
1597 AU.addRequired<DominatorTreeWrapperPass>();
1598 AU.addRequired<LoopInfoWrapperPass>();
1599 AU.addRequired<ScalarEvolutionWrapperPass>();
1600 AU.addRequired<TargetTransformInfoWrapperPass>();
1601 AU.addRequired<AAResultsWrapperPass>();
1602 AU.addRequired<LoopAccessLegacyAnalysis>();
1603 AU.addRequired<DemandedBitsWrapperPass>();
1604 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1606 // We currently do not preserve loopinfo/dominator analyses with outer loop
1607 // vectorization. Until this is addressed, mark these analyses as preserved
1608 // only for non-VPlan-native path.
1609 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1610 if (!EnableVPlanNativePath) {
1611 AU.addPreserved<LoopInfoWrapperPass>();
1612 AU.addPreserved<DominatorTreeWrapperPass>();
1615 AU.addPreserved<BasicAAWrapperPass>();
1616 AU.addPreserved<GlobalsAAWrapperPass>();
1617 AU.addRequired<ProfileSummaryInfoWrapperPass>();
1621 } // end anonymous namespace
1623 //===----------------------------------------------------------------------===//
1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1625 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1626 //===----------------------------------------------------------------------===//
1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1629 // We need to place the broadcast of invariant variables outside the loop,
1630 // but only if it's proven safe to do so. Otherwise, the broadcast will be
1631 // inside the vector loop body.
1632 Instruction *Instr = dyn_cast<Instruction>(V);
1633 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1634 (!Instr ||
1635 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1636 // Place the code for broadcasting invariant variables in the new preheader.
1637 IRBuilder<>::InsertPointGuard Guard(Builder);
1638 if (SafeToHoist)
1639 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1641 // Broadcast the scalar into all locations in the vector.
1642 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1644 return Shuf;
1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1648 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1649 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1650 "Expected either an induction phi-node or a truncate of it!");
1651 Value *Start = II.getStartValue();
1653 // Construct the initial value of the vector IV in the vector loop preheader
1654 auto CurrIP = Builder.saveIP();
1655 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1656 if (isa<TruncInst>(EntryVal)) {
1657 assert(Start->getType()->isIntegerTy() &&
1658 "Truncation requires an integer type");
1659 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1660 Step = Builder.CreateTrunc(Step, TruncType);
1661 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1663 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1664 Value *SteppedStart =
1665 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1667 // We create vector phi nodes for both integer and floating-point induction
1668 // variables. Here, we determine the kind of arithmetic we will perform.
1669 Instruction::BinaryOps AddOp;
1670 Instruction::BinaryOps MulOp;
1671 if (Step->getType()->isIntegerTy()) {
1672 AddOp = Instruction::Add;
1673 MulOp = Instruction::Mul;
1674 } else {
1675 AddOp = II.getInductionOpcode();
1676 MulOp = Instruction::FMul;
1679 // Multiply the vectorization factor by the step using integer or
1680 // floating-point arithmetic as appropriate.
1681 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1682 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1684 // Create a vector splat to use in the induction update.
1686 // FIXME: If the step is non-constant, we create the vector splat with
1687 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1688 // handle a constant vector splat.
1689 Value *SplatVF = isa<Constant>(Mul)
1690 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1691 : Builder.CreateVectorSplat(VF, Mul);
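  // For example, with VF = 4 and an integer step of 2, Mul is 8 and SplatVF is
  // <8, 8, 8, 8>, so each unroll part below advances the vector IV by VF * Step.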
1692 Builder.restoreIP(CurrIP);
1694 // We may need to add the step a number of times, depending on the unroll
1695 // factor. The last of those goes into the PHI.
1696 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1697 &*LoopVectorBody->getFirstInsertionPt());
1698 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1699 Instruction *LastInduction = VecInd;
1700 for (unsigned Part = 0; Part < UF; ++Part) {
1701 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1703 if (isa<TruncInst>(EntryVal))
1704 addMetadata(LastInduction, EntryVal);
1705 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1707 LastInduction = cast<Instruction>(addFastMathFlag(
1708 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1709 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1712 // Move the last step to the end of the latch block. This ensures consistent
1713 // placement of all induction updates.
1714 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1715 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1716 auto *ICmp = cast<Instruction>(Br->getCondition());
1717 LastInduction->moveBefore(ICmp);
1718 LastInduction->setName("vec.ind.next");
1720 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1721 VecInd->addIncoming(LastInduction, LoopVectorLatch);
1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1725 return Cost->isScalarAfterVectorization(I, VF) ||
1726 Cost->isProfitableToScalarize(I, VF);
1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1730 if (shouldScalarizeInstruction(IV))
1731 return true;
1732 auto isScalarInst = [&](User *U) -> bool {
1733 auto *I = cast<Instruction>(U);
1734 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1736 return llvm::any_of(IV->users(), isScalarInst);
1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1740 const InductionDescriptor &ID, const Instruction *EntryVal,
1741 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1742 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1743 "Expected either an induction phi-node or a truncate of it!");
1745 // This induction variable is not the phi from the original loop but the
1746 // newly-created IV based on the proof that the casted phi is equal to the
1747 // uncasted phi in the vectorized loop (possibly under a runtime guard). It
1748 // reuses the same InductionDescriptor as the original IV, but we don't
1749 // have to do any recording in this case - that is done when the original
1750 // IV is processed.
1751 if (isa<TruncInst>(EntryVal))
1752 return;
1754 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1755 if (Casts.empty())
1756 return;
1757 // Only the first Cast instruction in the Casts vector is of interest.
1758 // The rest of the Casts (if any) have no uses outside the
1759 // induction update chain itself.
1760 Instruction *CastInst = *Casts.begin();
1761 if (Lane < UINT_MAX)
1762 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1763 else
1764 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1768 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1769 "Primary induction variable must have an integer type");
1771 auto II = Legal->getInductionVars()->find(IV);
1772 assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1774 auto ID = II->second;
1775 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1777 // The scalar value to broadcast. This will be derived from the canonical
1778 // induction variable.
1779 Value *ScalarIV = nullptr;
1781 // The value from the original loop to which we are mapping the new induction
1782 // variable.
1783 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1785 // True if we have vectorized the induction variable.
1786 auto VectorizedIV = false;
1788 // Determine if we want a scalar version of the induction variable. This is
1789 // true if the induction variable itself is not widened, or if it has at
1790 // least one user in the loop that is not widened.
1791 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1793 // Generate code for the induction step. Note that induction steps are
1794 // required to be loop-invariant.
1795 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1796 "Induction step should be loop invariant");
1797 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1798 Value *Step = nullptr;
1799 if (PSE.getSE()->isSCEVable(IV->getType())) {
1800 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1801 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1802 LoopVectorPreHeader->getTerminator());
1803 } else {
1804 Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1807 // Try to create a new independent vector induction variable. If we can't
1808 // create the phi node, we will splat the scalar induction variable in each
1809 // loop iteration.
1810 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1811 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1812 VectorizedIV = true;
1815 // If we haven't yet vectorized the induction variable, or if we will create
1816 // a scalar one, we need to define the scalar induction variable and step
1817 // values. If we were given a truncation type, truncate the canonical
1818 // induction variable and step. Otherwise, derive these values from the
1819 // induction descriptor.
1820 if (!VectorizedIV || NeedsScalarIV) {
1821 ScalarIV = Induction;
1822 if (IV != OldInduction) {
1823 ScalarIV = IV->getType()->isIntegerTy()
1824 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1825 : Builder.CreateCast(Instruction::SIToFP, Induction,
1826 IV->getType());
1827 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1828 ScalarIV->setName("offset.idx");
1830 if (Trunc) {
1831 auto *TruncType = cast<IntegerType>(Trunc->getType());
1832 assert(Step->getType()->isIntegerTy() &&
1833 "Truncation requires an integer step");
1834 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1835 Step = Builder.CreateTrunc(Step, TruncType);
1839 // If we haven't yet vectorized the induction variable, splat the scalar
1840 // induction variable, and build the necessary step vectors.
1841 // TODO: Don't do it unless the vectorized IV is really required.
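  // For example, for an integer IV with UF = 2 and VF = 4, this produces the
  // two vector values ScalarIV + <0,1,2,3> * Step and ScalarIV + <4,5,6,7> * Step.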
1842 if (!VectorizedIV) {
1843 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1844 for (unsigned Part = 0; Part < UF; ++Part) {
1845 Value *EntryPart =
1846 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1847 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1848 if (Trunc)
1849 addMetadata(EntryPart, Trunc);
1850 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1854 // If an induction variable is only used for counting loop iterations or
1855 // calculating addresses, it doesn't need to be widened. Create scalar steps
1856 // that can be used by instructions we will later scalarize. Note that the
1857 // addition of the scalar steps will not increase the number of instructions
1858 // in the loop in the common case prior to InstCombine. We will be trading
1859 // one vector extract for each scalar step.
1860 if (NeedsScalarIV)
1861 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1865 Instruction::BinaryOps BinOp) {
1866 // Create and check the types.
1867 assert(Val->getType()->isVectorTy() && "Must be a vector");
1868 int VLen = Val->getType()->getVectorNumElements();
1870 Type *STy = Val->getType()->getScalarType();
1871 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1872 "Induction Step must be an integer or FP");
1873 assert(Step->getType() == STy && "Step has wrong type");
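  // The result is Val + <StartIdx, StartIdx + 1, ...> * Step. For example, with
  // VF = 4, StartIdx = 0 and Step = 2, this produces Val + <0, 2, 4, 6>.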
1875 SmallVector<Constant *, 8> Indices;
1877 if (STy->isIntegerTy()) {
1878 // Create a vector of consecutive integers starting at StartIdx.
1879 for (int i = 0; i < VLen; ++i)
1880 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1882 // Add the consecutive indices to the vector value.
1883 Constant *Cv = ConstantVector::get(Indices);
1884 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1885 Step = Builder.CreateVectorSplat(VLen, Step);
1886 assert(Step->getType() == Val->getType() && "Invalid step vec");
1887 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1888 // which can be found from the original scalar operations.
1889 Step = Builder.CreateMul(Cv, Step);
1890 return Builder.CreateAdd(Val, Step, "induction");
1893 // Floating point induction.
1894 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1895 "Binary Opcode should be specified for FP induction");
1896 // Create a vector of consecutive FP values starting at StartIdx.
1897 for (int i = 0; i < VLen; ++i)
1898 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1900 // Add the consecutive indices to the vector value.
1901 Constant *Cv = ConstantVector::get(Indices);
1903 Step = Builder.CreateVectorSplat(VLen, Step);
1905 // Floating point operations had to be 'fast' to enable the induction.
1906 FastMathFlags Flags;
1907 Flags.setFast();
1909 Value *MulOp = Builder.CreateFMul(Cv, Step);
1910 if (isa<Instruction>(MulOp))
1911 // Have to check, MulOp may be a constant
1912 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1914 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1915 if (isa<Instruction>(BOp))
1916 cast<Instruction>(BOp)->setFastMathFlags(Flags);
1917 return BOp;
1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1921 Instruction *EntryVal,
1922 const InductionDescriptor &ID) {
1923 // We shouldn't have to build scalar steps if we aren't vectorizing.
1924 assert(VF > 1 && "VF should be greater than one");
1926 // Get the value type and ensure it and the step have the same integer type.
1927 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1928 assert(ScalarIVTy == Step->getType() &&
1929 "Val and Step should have the same type");
1931 // We build scalar steps for both integer and floating-point induction
1932 // variables. Here, we determine the kind of arithmetic we will perform.
1933 Instruction::BinaryOps AddOp;
1934 Instruction::BinaryOps MulOp;
1935 if (ScalarIVTy->isIntegerTy()) {
1936 AddOp = Instruction::Add;
1937 MulOp = Instruction::Mul;
1938 } else {
1939 AddOp = ID.getInductionOpcode();
1940 MulOp = Instruction::FMul;
1943 // Determine the number of scalars we need to generate for each unroll
1944 // iteration. If EntryVal is uniform, we only need to generate the first
1945 // lane. Otherwise, we generate all VF values.
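  // For example, with UF = 2, VF = 4 and a non-uniform EntryVal, the scalar
  // steps are ScalarIV + {0, 1, ..., 7} * Step, spread across the two unroll parts.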
1946 unsigned Lanes =
1947 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1948 : VF;
1949 // Compute the scalar steps and save the results in VectorLoopValueMap.
1950 for (unsigned Part = 0; Part < UF; ++Part) {
1951 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1952 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1953 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1954 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1955 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1956 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1962 assert(V != Induction && "The new induction variable should not be used.");
1963 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1964 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1966 // If we have a stride that is replaced by one, do it here. Defer this for
1967 // the VPlan-native path until we start running Legal checks in that path.
1968 if (!EnableVPlanNativePath && Legal->hasStride(V))
1969 V = ConstantInt::get(V->getType(), 1);
1971 // If we have a vector mapped to this value, return it.
1972 if (VectorLoopValueMap.hasVectorValue(V, Part))
1973 return VectorLoopValueMap.getVectorValue(V, Part);
1975 // If the value has not been vectorized, check if it has been scalarized
1976 // instead. If it has been scalarized, and we actually need the value in
1977 // vector form, we will construct the vector values on demand.
1978 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1979 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1981 // If we've scalarized a value, that value should be an instruction.
1982 auto *I = cast<Instruction>(V);
1984 // If we aren't vectorizing, we can just copy the scalar map values over to
1985 // the vector map.
1986 if (VF == 1) {
1987 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1988 return ScalarValue;
1991 // Get the last scalar instruction we generated for V and Part. If the value
1992 // is known to be uniform after vectorization, this corresponds to lane zero
1993 // of the Part unroll iteration. Otherwise, the last instruction is the one
1994 // we created for the last vector lane of the Part unroll iteration.
1995 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1996 auto *LastInst = cast<Instruction>(
1997 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1999 // Set the insert point after the last scalarized instruction. This ensures
2000 // the insertelement sequence will directly follow the scalar definitions.
2001 auto OldIP = Builder.saveIP();
2002 auto NewIP = std::next(BasicBlock::iterator(LastInst));
2003 Builder.SetInsertPoint(&*NewIP);
2005 // However, if we are vectorizing, we need to construct the vector values.
2006 // If the value is known to be uniform after vectorization, we can just
2007 // broadcast the scalar value corresponding to lane zero for each unroll
2008 // iteration. Otherwise, we construct the vector values using insertelement
2009 // instructions. Since the resulting vectors are stored in
2010 // VectorLoopValueMap, we will only generate the insertelements once.
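  // For example, with VF = 4 a non-uniform scalarized value is packed into a
  // vector by four insertelement instructions starting from an undef vector.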
2011 Value *VectorValue = nullptr;
2012 if (Cost->isUniformAfterVectorization(I, VF)) {
2013 VectorValue = getBroadcastInstrs(ScalarValue);
2014 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2015 } else {
2016 // Initialize packing with insertelements to start from undef.
2017 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2018 VectorLoopValueMap.setVectorValue(V, Part, Undef);
2019 for (unsigned Lane = 0; Lane < VF; ++Lane)
2020 packScalarIntoVectorValue(V, {Part, Lane});
2021 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2023 Builder.restoreIP(OldIP);
2024 return VectorValue;
2027 // If this scalar is unknown, assume that it is a constant or that it is
2028 // loop invariant. Broadcast V and save the value for future uses.
2029 Value *B = getBroadcastInstrs(V);
2030 VectorLoopValueMap.setVectorValue(V, Part, B);
2031 return B;
2034 Value *
2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2036 const VPIteration &Instance) {
2037 // If the value is not an instruction contained in the loop, it should
2038 // already be scalar.
2039 if (OrigLoop->isLoopInvariant(V))
2040 return V;
2042 assert(Instance.Lane > 0
2043 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2044 : true && "Uniform values only have lane zero");
2046 // If the value from the original loop has not been vectorized, it is
2047 // represented by UF x VF scalar values in the new loop. Return the requested
2048 // scalar value.
2049 if (VectorLoopValueMap.hasScalarValue(V, Instance))
2050 return VectorLoopValueMap.getScalarValue(V, Instance);
2052 // If the value has not been scalarized, get its entry in VectorLoopValueMap
2053 // for the given unroll part. If this entry is not a vector type (i.e., the
2054 // vectorization factor is one), there is no need to generate an
2055 // extractelement instruction.
2056 auto *U = getOrCreateVectorValue(V, Instance.Part);
2057 if (!U->getType()->isVectorTy()) {
2058 assert(VF == 1 && "Value not scalarized has non-vector type");
2059 return U;
2062 // Otherwise, the value from the original loop has been vectorized and is
2063 // represented by UF vector values. Extract and return the requested scalar
2064 // value from the appropriate vector lane.
2065 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2068 void InnerLoopVectorizer::packScalarIntoVectorValue(
2069 Value *V, const VPIteration &Instance) {
2070 assert(V != Induction && "The new induction variable should not be used.");
2071 assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2072 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2074 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2075 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2076 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2077 Builder.getInt32(Instance.Lane));
2078 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2082 assert(Vec->getType()->isVectorTy() && "Invalid type");
2083 SmallVector<Constant *, 8> ShuffleMask;
2084 for (unsigned i = 0; i < VF; ++i)
2085 ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2087 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2088 ConstantVector::get(ShuffleMask),
2089 "reverse");
2092 // Return whether we allow using masked interleave-groups (for dealing with
2093 // strided loads/stores that reside in predicated blocks, or for dealing
2094 // with gaps).
2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2096 // If an override option has been passed in for interleaved accesses, use it.
2097 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2098 return EnableMaskedInterleavedMemAccesses;
2100 return TTI.enableMaskedInterleavedAccessVectorization();
2103 // Try to vectorize the interleave group that \p Instr belongs to.
2105 // E.g. Translate the following interleaved load group (factor = 3):
2106 // for (i = 0; i < N; i+=3) {
2107 // R = Pic[i]; // Member of index 0
2108 // G = Pic[i+1]; // Member of index 1
2109 // B = Pic[i+2]; // Member of index 2
2110 // ... // do something to R, G, B
2111 // }
2112 // To:
2113 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2114 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2115 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2116 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2118 // Or translate the following interleaved store group (factor = 3):
2119 // for (i = 0; i < N; i+=3) {
2120 // ... do something to R, G, B
2121 // Pic[i] = R; // Member of index 0
2122 // Pic[i+1] = G; // Member of index 1
2123 // Pic[i+2] = B; // Member of index 2
2124 // }
2125 // To:
2126 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2127 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2128 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2129 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2130 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2132 VectorParts *BlockInMask) {
2133 const InterleaveGroup<Instruction> *Group =
2134 Cost->getInterleavedAccessGroup(Instr);
2135 assert(Group && "Fail to get an interleaved access group.");
2137 // Skip if current instruction is not the insert position.
2138 if (Instr != Group->getInsertPos())
2139 return;
2141 const DataLayout &DL = Instr->getModule()->getDataLayout();
2142 Value *Ptr = getLoadStorePointerOperand(Instr);
2144 // Prepare for the vector type of the interleaved load/store.
2145 Type *ScalarTy = getMemInstValueType(Instr);
2146 unsigned InterleaveFactor = Group->getFactor();
2147 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2148 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2150 // Prepare for the new pointers.
2151 setDebugLocFromInst(Builder, Ptr);
2152 SmallVector<Value *, 2> NewPtrs;
2153 unsigned Index = Group->getIndex(Instr);
2155 VectorParts Mask;
2156 bool IsMaskForCondRequired = BlockInMask;
2157 if (IsMaskForCondRequired) {
2158 Mask = *BlockInMask;
2159 // TODO: extend the masked interleaved-group support to reversed access.
2160 assert(!Group->isReverse() && "Reversed masked interleave-group "
2161 "not supported.");
2164 // If the group is reverse, adjust the index to refer to the last vector lane
2165 // instead of the first. We adjust the index from the first vector lane,
2166 // rather than directly getting the pointer for lane VF - 1, because the
2167 // pointer operand of the interleaved access is supposed to be uniform. For
2168 // uniform instructions, we're only required to generate a value for the
2169 // first vector lane in each unroll iteration.
2170 if (Group->isReverse())
2171 Index += (VF - 1) * Group->getFactor();
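  // For example, with VF = 4 and an interleave factor of 3, the index is
  // advanced by 9 members so that the adjusted pointer addresses the tuple
  // accessed by the last vector lane.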
2173 bool InBounds = false;
2174 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2175 InBounds = gep->isInBounds();
2177 for (unsigned Part = 0; Part < UF; Part++) {
2178 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2180 // Notice that the current instruction could be at any member index. We need
2181 // to adjust the address to the member of index 0.
2183 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2184 // b = A[i]; // Member of index 0
2185 // The current pointer points to A[i+1]; adjust it to A[i].
2187 // E.g. A[i+1] = a; // Member of index 1
2188 // A[i] = b; // Member of index 0
2189 // A[i+2] = c; // Member of index 2 (Current instruction)
2190 // The current pointer points to A[i+2]; adjust it to A[i].
2191 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2192 if (InBounds)
2193 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2195 // Cast to the vector pointer type.
2196 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2199 setDebugLocFromInst(Builder, Instr);
2200 Value *UndefVec = UndefValue::get(VecTy);
2202 Value *MaskForGaps = nullptr;
2203 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2204 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2205 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2208 // Vectorize the interleaved load group.
2209 if (isa<LoadInst>(Instr)) {
2210 // For each unroll part, create a wide load for the group.
2211 SmallVector<Value *, 2> NewLoads;
2212 for (unsigned Part = 0; Part < UF; Part++) {
2213 Instruction *NewLoad;
2214 if (IsMaskForCondRequired || MaskForGaps) {
2215 assert(useMaskedInterleavedAccesses(*TTI) &&
2216 "masked interleaved groups are not allowed.");
2217 Value *GroupMask = MaskForGaps;
2218 if (IsMaskForCondRequired) {
2219 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2220 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2221 Value *ShuffledMask = Builder.CreateShuffleVector(
2222 Mask[Part], Undefs, RepMask, "interleaved.mask");
2223 GroupMask = MaskForGaps
2224 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2225 MaskForGaps)
2226 : ShuffledMask;
2228 NewLoad =
2229 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2230 GroupMask, UndefVec, "wide.masked.vec");
2232 else
2233 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2234 Group->getAlignment(), "wide.vec");
2235 Group->addMetadata(NewLoad);
2236 NewLoads.push_back(NewLoad);
2239 // For each member in the group, shuffle out the appropriate data from the
2240 // wide loads.
2241 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2242 Instruction *Member = Group->getMember(I);
2244 // Skip the gaps in the group.
2245 if (!Member)
2246 continue;
2248 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2249 for (unsigned Part = 0; Part < UF; Part++) {
2250 Value *StridedVec = Builder.CreateShuffleVector(
2251 NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2253 // If this member has a different type, cast the result to that type.
2254 if (Member->getType() != ScalarTy) {
2255 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2256 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2259 if (Group->isReverse())
2260 StridedVec = reverseVector(StridedVec);
2262 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2265 return;
2268 // The sub vector type for current instruction.
2269 VectorType *SubVT = VectorType::get(ScalarTy, VF);
2271 // Vectorize the interleaved store group.
2272 for (unsigned Part = 0; Part < UF; Part++) {
2273 // Collect the stored vector from each member.
2274 SmallVector<Value *, 4> StoredVecs;
2275 for (unsigned i = 0; i < InterleaveFactor; i++) {
2276 // An interleaved store group doesn't allow gaps, so each index has a member.
2277 Instruction *Member = Group->getMember(i);
2278 assert(Member && "Fail to get a member from an interleaved store group");
2280 Value *StoredVec = getOrCreateVectorValue(
2281 cast<StoreInst>(Member)->getValueOperand(), Part);
2282 if (Group->isReverse())
2283 StoredVec = reverseVector(StoredVec);
2285 // If this member has a different type, cast it to the unified type.
2287 if (StoredVec->getType() != SubVT)
2288 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2290 StoredVecs.push_back(StoredVec);
2293 // Concatenate all vectors into a wide vector.
2294 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2296 // Interleave the elements in the wide vector.
2297 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2298 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2299 "interleaved.vec");
2301 Instruction *NewStoreInstr;
2302 if (IsMaskForCondRequired) {
2303 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2304 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2305 Value *ShuffledMask = Builder.CreateShuffleVector(
2306 Mask[Part], Undefs, RepMask, "interleaved.mask");
2307 NewStoreInstr = Builder.CreateMaskedStore(
2308 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2310 else
2311 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2312 Group->getAlignment());
2314 Group->addMetadata(NewStoreInstr);
2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2319 VectorParts *BlockInMask) {
2320 // Attempt to issue a wide load.
2321 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2322 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2324 assert((LI || SI) && "Invalid Load/Store instruction");
2326 LoopVectorizationCostModel::InstWidening Decision =
2327 Cost->getWideningDecision(Instr, VF);
2328 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2329 "CM decision should be taken at this point");
2330 if (Decision == LoopVectorizationCostModel::CM_Interleave)
2331 return vectorizeInterleaveGroup(Instr);
2333 Type *ScalarDataTy = getMemInstValueType(Instr);
2334 Type *DataTy = VectorType::get(ScalarDataTy, VF);
2335 Value *Ptr = getLoadStorePointerOperand(Instr);
2336 unsigned Alignment = getLoadStoreAlignment(Instr);
2337 // An alignment of 0 means target ABI alignment. We need to use the scalar's
2338 // target ABI alignment in such a case.
2339 const DataLayout &DL = Instr->getModule()->getDataLayout();
2340 if (!Alignment)
2341 Alignment = DL.getABITypeAlignment(ScalarDataTy);
2342 unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2344 // Determine if the pointer operand of the access is either consecutive or
2345 // reverse consecutive.
2346 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2347 bool ConsecutiveStride =
2348 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2349 bool CreateGatherScatter =
2350 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2352 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2353 // gather/scatter. Otherwise Decision should have been to Scalarize.
2354 assert((ConsecutiveStride || CreateGatherScatter) &&
2355 "The instruction should be scalarized");
2357 // Handle consecutive loads/stores.
2358 if (ConsecutiveStride)
2359 Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2361 VectorParts Mask;
2362 bool isMaskRequired = BlockInMask;
2363 if (isMaskRequired)
2364 Mask = *BlockInMask;
2366 bool InBounds = false;
2367 if (auto *gep = dyn_cast<GetElementPtrInst>(
2368 getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2369 InBounds = gep->isInBounds();
2371 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2372 // Calculate the pointer for the specific unroll-part.
2373 GetElementPtrInst *PartPtr = nullptr;
2375 if (Reverse) {
2376 // If the address is consecutive but reversed, then the
2377 // wide store needs to start at the last vector element.
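      // For example, with VF = 4 and Part = 0 the two GEPs below produce
      // Ptr[-3], so the wide access covers Ptr[-3..0] and is consumed in
      // reversed lane order.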
2378 PartPtr = cast<GetElementPtrInst>(
2379 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2380 PartPtr->setIsInBounds(InBounds);
2381 PartPtr = cast<GetElementPtrInst>(
2382 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2383 PartPtr->setIsInBounds(InBounds);
2384 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2385 Mask[Part] = reverseVector(Mask[Part]);
2386 } else {
2387 PartPtr = cast<GetElementPtrInst>(
2388 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2389 PartPtr->setIsInBounds(InBounds);
2392 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2395 // Handle Stores:
2396 if (SI) {
2397 setDebugLocFromInst(Builder, SI);
2399 for (unsigned Part = 0; Part < UF; ++Part) {
2400 Instruction *NewSI = nullptr;
2401 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2402 if (CreateGatherScatter) {
2403 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2404 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2405 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2406 MaskPart);
2407 } else {
2408 if (Reverse) {
2409 // If we store to reverse consecutive memory locations, then we need
2410 // to reverse the order of elements in the stored value.
2411 StoredVal = reverseVector(StoredVal);
2412 // We don't want to update the value in the map as it might be used in
2413 // another expression. So don't call resetVectorValue(StoredVal).
2415 auto *VecPtr = CreateVecPtr(Part, Ptr);
2416 if (isMaskRequired)
2417 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2418 Mask[Part]);
2419 else
2420 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2422 addMetadata(NewSI, SI);
2424 return;
2427 // Handle loads.
2428 assert(LI && "Must have a load instruction");
2429 setDebugLocFromInst(Builder, LI);
2430 for (unsigned Part = 0; Part < UF; ++Part) {
2431 Value *NewLI;
2432 if (CreateGatherScatter) {
2433 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2434 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2435 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2436 nullptr, "wide.masked.gather");
2437 addMetadata(NewLI, LI);
2438 } else {
2439 auto *VecPtr = CreateVecPtr(Part, Ptr);
2440 if (isMaskRequired)
2441 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2442 UndefValue::get(DataTy),
2443 "wide.masked.load");
2444 else
2445 NewLI =
2446 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2448 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2449 addMetadata(NewLI, LI);
2450 if (Reverse)
2451 NewLI = reverseVector(NewLI);
2453 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2458 const VPIteration &Instance,
2459 bool IfPredicateInstr) {
2460 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2462 setDebugLocFromInst(Builder, Instr);
2464 // Does this instruction return a value?
2465 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2467 Instruction *Cloned = Instr->clone();
2468 if (!IsVoidRetTy)
2469 Cloned->setName(Instr->getName() + ".cloned");
2471 // Replace the operands of the cloned instructions with their scalar
2472 // equivalents in the new loop.
2473 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2474 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2475 Cloned->setOperand(op, NewOp);
2477 addNewMetadata(Cloned, Instr);
2479 // Place the cloned scalar in the new loop.
2480 Builder.Insert(Cloned);
2482 // Add the cloned scalar to the scalar map entry.
2483 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2485 // If we just cloned a new assumption, add it to the assumption cache.
2486 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2487 if (II->getIntrinsicID() == Intrinsic::assume)
2488 AC->registerAssumption(II);
2490 // End if-block.
2491 if (IfPredicateInstr)
2492 PredicatedInstructions.push_back(Cloned);
2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2496 Value *End, Value *Step,
2497 Instruction *DL) {
2498 BasicBlock *Header = L->getHeader();
2499 BasicBlock *Latch = L->getLoopLatch();
2500 // As we're just creating this loop, it's possible no latch exists
2501 // yet. If so, use the header as this will be a single block loop.
2502 if (!Latch)
2503 Latch = Header;
2505 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2506 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2507 setDebugLocFromInst(Builder, OldInst);
2508 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2510 Builder.SetInsertPoint(Latch->getTerminator());
2511 setDebugLocFromInst(Builder, OldInst);
2513 // Create i+1 and fill the PHINode.
2514 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2515 Induction->addIncoming(Start, L->getLoopPreheader());
2516 Induction->addIncoming(Next, Latch);
2517 // Create the compare.
2518 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2519 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2521 // Now we have two terminators. Remove the old one from the block.
2522 Latch->getTerminator()->eraseFromParent();
2524 return Induction;
2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2528 if (TripCount)
2529 return TripCount;
2531 assert(L && "Create Trip Count for null loop.");
2532 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2533 // Find the loop boundaries.
2534 ScalarEvolution *SE = PSE.getSE();
2535 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2536 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2537 "Invalid loop count");
2539 Type *IdxTy = Legal->getWidestInductionType();
2540 assert(IdxTy && "No type for induction");
2542 // The exit count might have type i64 while the phi is i32. This can
2543 // happen if we have an induction variable that is sign extended before the
2544 // compare. The only way we get a backedge-taken count is if the induction
2545 // variable was signed and, as such, will not overflow. In such a case
2546 // truncation is legal.
2547 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2548 IdxTy->getPrimitiveSizeInBits())
2549 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2550 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2552 // Get the total trip count from the count by adding 1.
2553 const SCEV *ExitCount = SE->getAddExpr(
2554 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2556 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2558 // Expand the trip count and place the new instructions in the preheader.
2559 // Notice that the pre-header does not change, only the loop body.
2560 SCEVExpander Exp(*SE, DL, "induction");
2562 // Count holds the overall loop count (N).
2563 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2564 L->getLoopPreheader()->getTerminator());
2566 if (TripCount->getType()->isPointerTy())
2567 TripCount =
2568 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2569 L->getLoopPreheader()->getTerminator());
2571 return TripCount;
2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2575 if (VectorTripCount)
2576 return VectorTripCount;
2578 Value *TC = getOrCreateTripCount(L);
2579 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2581 Type *Ty = TC->getType();
2582 Constant *Step = ConstantInt::get(Ty, VF * UF);
2584 // If the tail is to be folded by masking, round the number of iterations N
2585 // up to a multiple of Step instead of rounding down. This is done by first
2586 // adding Step-1 and then rounding down. Note that it's ok if this addition
2587 // overflows: the vector induction variable will eventually wrap to zero given
2588 // that it starts at zero and its Step is a power of two; the loop will then
2589 // exit, with the last early-exit vector comparison also producing all-true.
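  // For example, with VF * UF = 8 and an original trip count of 10, N becomes
  // 17 here and the vector trip count computed below is 16, so the masked
  // vector loop covers all 10 scalar iterations.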
2590 if (Cost->foldTailByMasking()) {
2591 assert(isPowerOf2_32(VF * UF) &&
2592 "VF*UF must be a power of 2 when folding tail by masking");
2593 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2596 // Now we need to generate the expression for the part of the loop that the
2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598 // iterations are not required for correctness, or N - Step, otherwise. Step
2599 // is equal to the vectorization factor (number of SIMD elements) times the
2600 // unroll factor (number of SIMD instructions).
2601 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2603 // If there is a non-reversed interleaved group that may speculatively access
2604 // memory out-of-bounds, we need to ensure that there will be at least one
2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606 // the trip count, we set the remainder to be equal to the step. If the step
2607 // does not evenly divide the trip count, no adjustment is necessary since
2608 // there will already be scalar iterations. Note that the minimum iterations
2609 // check ensures that N >= Step.
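  // For example, with VF * UF = 8 and a trip count of 16, the remainder would
  // be 0; it is bumped to 8 so that 8 iterations are left for the scalar
  // epilogue loop.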
2610 if (VF > 1 && Cost->requiresScalarEpilogue()) {
2611 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2612 R = Builder.CreateSelect(IsZero, Step, R);
2615 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2617 return VectorTripCount;
2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2621 const DataLayout &DL) {
2622 // Verify that V is a vector type with same number of elements as DstVTy.
2623 unsigned VF = DstVTy->getNumElements();
2624 VectorType *SrcVecTy = cast<VectorType>(V->getType());
2625 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2626 Type *SrcElemTy = SrcVecTy->getElementType();
2627 Type *DstElemTy = DstVTy->getElementType();
2628 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2629 "Vector elements must have same size");
2631 // Do a direct cast if element types are castable.
2632 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2633 return Builder.CreateBitOrPointerCast(V, DstVTy);
2635 // V cannot be directly cast to the desired vector type.
2636 // May happen when V is a floating point vector but DstVTy is a vector of
2637 // pointers or vice-versa. Handle this using a two-step bitcast using an
2638 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2639 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2640 "Only one type should be a pointer type");
2641 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2642 "Only one type should be a floating point type");
2643 Type *IntTy =
2644 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2645 VectorType *VecIntTy = VectorType::get(IntTy, VF);
2646 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2647 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2651 BasicBlock *Bypass) {
2652 Value *Count = getOrCreateTripCount(L);
2653 BasicBlock *BB = L->getLoopPreheader();
2654 IRBuilder<> Builder(BB->getTerminator());
2656 // Generate code to check if the loop's trip count is less than VF * UF, or
2657 // equal to it in case a scalar epilogue is required; this implies that the
2658 // vector trip count is zero. This check also covers the case where adding one
2659 // to the backedge-taken count overflowed, leading to an incorrect trip count
2660 // of zero. In this case we will also jump to the scalar loop.
2661 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662 : ICmpInst::ICMP_ULT;
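  // For example, with VF = 4, UF = 2 and no required scalar epilogue, the
  // bypass to the scalar loop is taken when the trip count is less than 8.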
2664 // If tail is to be folded, vector loop takes care of all iterations.
2665 Value *CheckMinIters = Builder.getFalse();
2666 if (!Cost->foldTailByMasking())
2667 CheckMinIters = Builder.CreateICmp(
2668 P, Count, ConstantInt::get(Count->getType(), VF * UF),
2669 "min.iters.check");
2671 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2672 // Update dominator tree immediately if the generated block is a
2673 // LoopBypassBlock because SCEV expansions to generate loop bypass
2674 // checks may query it before the current function is finished.
2675 DT->addNewBlock(NewBB, BB);
2676 if (L->getParentLoop())
2677 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2678 ReplaceInstWithInst(BB->getTerminator(),
2679 BranchInst::Create(Bypass, NewBB, CheckMinIters));
2680 LoopBypassBlocks.push_back(BB);
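// [Editorial note] Rough shape of the guard emitted above for VF=4, UF=2 with
// no tail folding (value and block names are illustrative):
//   %min.iters.check = icmp ult i64 %trip.count, 8  ; ule if a scalar epilogue
//                                                   ; is required
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph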
2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2684 BasicBlock *BB = L->getLoopPreheader();
2686 // Generate the code to check the SCEV assumptions that we made.
2687 // We want the new basic block to start at the first instruction in a
2688 // sequence of instructions that form a check.
2689 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2690 "scev.check");
2691 Value *SCEVCheck =
2692 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2694 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2695 if (C->isZero())
2696 return;
2698 assert(!Cost->foldTailByMasking() &&
2699 "Cannot SCEV check stride or overflow when folding tail");
2700 // Create a new block containing the stride check.
2701 BB->setName("vector.scevcheck");
2702 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2703 // Update dominator tree immediately if the generated block is a
2704 // LoopBypassBlock because SCEV expansions to generate loop bypass
2705 // checks may query it before the current function is finished.
2706 DT->addNewBlock(NewBB, BB);
2707 if (L->getParentLoop())
2708 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2709 ReplaceInstWithInst(BB->getTerminator(),
2710 BranchInst::Create(Bypass, NewBB, SCEVCheck));
2711 LoopBypassBlocks.push_back(BB);
2712 AddedSafetyChecks = true;
2715 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2716 // VPlan-native path does not do any analysis for runtime checks currently.
2717 if (EnableVPlanNativePath)
2718 return;
2720 BasicBlock *BB = L->getLoopPreheader();
2722 // Generate the code that checks at runtime whether arrays overlap. We put the
2723 // checks into a separate block to make the more common case of few elements
2724 // faster.
2725 Instruction *FirstCheckInst;
2726 Instruction *MemRuntimeCheck;
2727 std::tie(FirstCheckInst, MemRuntimeCheck) =
2728 Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2729 if (!MemRuntimeCheck)
2730 return;
2732 assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2733 // Create a new block containing the memory check.
2734 BB->setName("vector.memcheck");
2735 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2736 // Update dominator tree immediately if the generated block is a
2737 // LoopBypassBlock because SCEV expansions to generate loop bypass
2738 // checks may query it before the current function is finished.
2739 DT->addNewBlock(NewBB, BB);
2740 if (L->getParentLoop())
2741 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2742 ReplaceInstWithInst(BB->getTerminator(),
2743 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2744 LoopBypassBlocks.push_back(BB);
2745 AddedSafetyChecks = true;
2747 // We currently don't use LoopVersioning for the actual loop cloning but we
2748 // still use it to add the noalias metadata.
2749 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2750 PSE.getSE());
2751 LVer->prepareNoAliasMetadata();
2754 Value *InnerLoopVectorizer::emitTransformedIndex(
2755 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2756 const InductionDescriptor &ID) const {
2758 SCEVExpander Exp(*SE, DL, "induction");
2759 auto Step = ID.getStep();
2760 auto StartValue = ID.getStartValue();
2761 assert(Index->getType() == Step->getType() &&
2762 "Index type does not match StepValue type");
2764 // Note: the IR at this point is broken. We cannot use SE to create any new
2765 // SCEV and then expand it, hoping that SCEV's simplification will give us
2766 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2767 // lead to various SCEV crashes. So all we can do is use the builder and rely
2768 // on InstCombine for future simplifications. Here we handle some trivial
2769 // cases only.
2770 auto CreateAdd = [&B](Value *X, Value *Y) {
2771 assert(X->getType() == Y->getType() && "Types don't match!");
2772 if (auto *CX = dyn_cast<ConstantInt>(X))
2773 if (CX->isZero())
2774 return Y;
2775 if (auto *CY = dyn_cast<ConstantInt>(Y))
2776 if (CY->isZero())
2777 return X;
2778 return B.CreateAdd(X, Y);
2781 auto CreateMul = [&B](Value *X, Value *Y) {
2782 assert(X->getType() == Y->getType() && "Types don't match!");
2783 if (auto *CX = dyn_cast<ConstantInt>(X))
2784 if (CX->isOne())
2785 return Y;
2786 if (auto *CY = dyn_cast<ConstantInt>(Y))
2787 if (CY->isOne())
2788 return X;
2789 return B.CreateMul(X, Y);
2792 switch (ID.getKind()) {
2793 case InductionDescriptor::IK_IntInduction: {
2794 assert(Index->getType() == StartValue->getType() &&
2795 "Index type does not match StartValue type");
2796 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2797 return B.CreateSub(StartValue, Index);
2798 auto *Offset = CreateMul(
2799 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2800 return CreateAdd(StartValue, Offset);
2802 case InductionDescriptor::IK_PtrInduction: {
2803 assert(isa<SCEVConstant>(Step) &&
2804 "Expected constant step for pointer induction");
2805 return B.CreateGEP(
2806 StartValue->getType()->getPointerElementType(), StartValue,
2807 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2808 &*B.GetInsertPoint())));
2810 case InductionDescriptor::IK_FpInduction: {
2811 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2812 auto InductionBinOp = ID.getInductionBinOp();
2813 assert(InductionBinOp &&
2814 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2815 InductionBinOp->getOpcode() == Instruction::FSub) &&
2816 "Original bin op should be defined for FP induction");
2818 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2820 // Floating point operations had to be 'fast' to enable the induction.
2821 FastMathFlags Flags;
2822 Flags.setFast();
2824 Value *MulExp = B.CreateFMul(StepValue, Index);
2825 if (isa<Instruction>(MulExp))
2826 // We have to check this because MulExp may be a constant.
2827 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2829 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2830 "induction");
2831 if (isa<Instruction>(BOp))
2832 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2834 return BOp;
2836 case InductionDescriptor::IK_NoInduction:
2837 return nullptr;
2839 llvm_unreachable("invalid enum");
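// [Editorial note] Illustrative results of emitTransformedIndex, where Start,
// Step and Index stand for the descriptor's start value, step and the given
// index:
//   IK_IntInduction : Start + Index * Step  (the CreateAdd/CreateMul helpers
//                     above fold "* 1" and "+ 0"; a constant step of -1
//                     becomes Start - Index)
//   IK_PtrInduction : gep Start, Index * Step
//   IK_FpInduction  : Start fadd/fsub (Index fmul Step), with fast-math flags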
2842 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2844 In this function we generate a new loop. The new loop will contain
2845 the vectorized instructions while the old loop will continue to run the
2846 scalar remainder.
2848        [ ] <-- loop iteration number check.
2849       /   |
2850      /    v
2851     |    [ ]     <-- vector loop bypass (may consist of multiple blocks).
2852     |  /  |
2853     | /   v
2854     ||   [ ]     <-- vector pre header.
2855     |/    |
2856     |     v
2857     |    [ ] \
2858     |    [ ]_|   <-- vector loop.
2859     |     |
2860     |     v
2861     |   -[ ]     <--- middle-block.
2862     |  /  |
2863     | /   v
2864     -|- >[ ]     <--- new preheader.
2865      |    |
2866      |    v
2867      |   [ ] \
2868      |   [ ]_|   <-- old scalar loop to handle remainder.
2869       \   |
2870        \  v
2871         >[ ]     <-- exit block.
2875 BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2876 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2877 BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2878 MDNode *OrigLoopID = OrigLoop->getLoopID();
2879 assert(VectorPH && "Invalid loop structure");
2880 assert(ExitBlock && "Must have an exit block");
2882 // Some loops have a single integer induction variable, while other loops
2883 // don't. One example is C++ iterators, which often have multiple pointer
2884 // induction variables. In the code below we also support a case where we
2885 // don't have a single induction variable.
2887 // We try as hard as possible to obtain an induction variable from the
2888 // original loop. However, if we don't find one that:
2889 // - is an integer
2890 // - counts from zero, stepping by one
2891 // - is the size of the widest induction variable type
2892 // then we create a new one.
2893 OldInduction = Legal->getPrimaryInduction();
2894 Type *IdxTy = Legal->getWidestInductionType();
2896 // Split the single block loop into the two loop structure described above.
2897 BasicBlock *VecBody =
2898 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2899 BasicBlock *MiddleBlock =
2900 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2901 BasicBlock *ScalarPH =
2902 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2904 // Create and register the new vector loop.
2905 Loop *Lp = LI->AllocateLoop();
2906 Loop *ParentLoop = OrigLoop->getParentLoop();
2908 // Insert the new loop into the loop nest and register the new basic blocks
2909 // before calling any utilities such as SCEV that require valid LoopInfo.
2910 if (ParentLoop) {
2911 ParentLoop->addChildLoop(Lp);
2912 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2913 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2914 } else {
2915 LI->addTopLevelLoop(Lp);
2917 Lp->addBasicBlockToLoop(VecBody, *LI);
2919 // Find the loop boundaries.
2920 Value *Count = getOrCreateTripCount(Lp);
2922 Value *StartIdx = ConstantInt::get(IdxTy, 0);
2924 // Now, compare the new count to zero. If it is zero skip the vector loop and
2925 // jump to the scalar loop. This check also covers the case where the
2926 // backedge-taken count is uint##_max: adding one to it will overflow leading
2927 // to an incorrect trip count of zero. In this (rare) case we will also jump
2928 // to the scalar loop.
2929 emitMinimumIterationCountCheck(Lp, ScalarPH);
2931 // Generate the code to check any assumptions that we've made for SCEV
2932 // expressions.
2933 emitSCEVChecks(Lp, ScalarPH);
2935 // Generate the code that checks at runtime whether arrays overlap. We put the
2936 // checks into a separate block to make the more common case of few elements
2937 // faster.
2938 emitMemRuntimeChecks(Lp, ScalarPH);
2940 // Generate the induction variable.
2941 // The loop step is equal to the vectorization factor (num of SIMD elements)
2942 // times the unroll factor (num of SIMD instructions).
2943 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2944 Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2945 Induction =
2946 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2947 getDebugLocFromInstOrOperands(OldInduction));
2949 // We are going to resume the execution of the scalar loop.
2950 // Go over all of the induction variables that we found and fix the
2951 // PHIs that are left in the scalar version of the loop.
2952 // The starting values of PHI nodes depend on the counter of the last
2953 // iteration in the vectorized loop.
2954 // If we come from a bypass edge then we need to start from the original
2955 // start value.
2957 // This variable saves the new starting index for the scalar loop. It is used
2958 // to test if there are any tail iterations left once the vector loop has
2959 // completed.
2960 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2961 for (auto &InductionEntry : *List) {
2962 PHINode *OrigPhi = InductionEntry.first;
2963 InductionDescriptor II = InductionEntry.second;
2965 // Create phi nodes to merge from the backedge-taken check block.
2966 PHINode *BCResumeVal = PHINode::Create(
2967 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2968 // Copy original phi DL over to the new one.
2969 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2970 Value *&EndValue = IVEndValues[OrigPhi];
2971 if (OrigPhi == OldInduction) {
2972 // We know what the end value is.
2973 EndValue = CountRoundDown;
2974 } else {
2975 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2976 Type *StepType = II.getStep()->getType();
2977 Instruction::CastOps CastOp =
2978 CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2979 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2980 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2981 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2982 EndValue->setName("ind.end");
2985 // The new PHI merges the original incoming value, in case of a bypass,
2986 // or the value at the end of the vectorized loop.
2987 BCResumeVal->addIncoming(EndValue, MiddleBlock);
2989 // Fix the scalar body counter (PHI node).
2990 // The old induction's phi node in the scalar body needs the truncated
2991 // value.
2992 for (BasicBlock *BB : LoopBypassBlocks)
2993 BCResumeVal->addIncoming(II.getStartValue(), BB);
2994 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2997 // We need the OrigLoop (scalar loop part) latch terminator to help
2998 // produce correct debug info for the middle block BB instructions.
2999 // The legality check stage guarantees that the loop will have a single
3000 // latch.
3001 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3002 "Scalar loop latch terminator isn't a branch");
3003 BranchInst *ScalarLatchBr =
3004 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3006 // Add a check in the middle block to see if we have completed
3007 // all of the iterations in the first vector loop.
3008 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3009 // If tail is to be folded, we know we don't need to run the remainder.
3010 Value *CmpN = Builder.getTrue();
3011 if (!Cost->foldTailByMasking()) {
3012 CmpN =
3013 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3014 CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3016 // Here we use the same DebugLoc as the scalar loop latch branch instead
3017 // of the corresponding compare because they may have ended up with
3018 // different line numbers and we want to avoid awkward line stepping while
3019 // debugging. E.g., if the compare got a line number inside the loop.
3020 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3023 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3024 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3025 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3027 // Get ready to start creating new instructions into the vectorized body.
3028 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3030 // Save the state.
3031 LoopVectorPreHeader = Lp->getLoopPreheader();
3032 LoopScalarPreHeader = ScalarPH;
3033 LoopMiddleBlock = MiddleBlock;
3034 LoopExitBlock = ExitBlock;
3035 LoopVectorBody = VecBody;
3036 LoopScalarBody = OldBasicBlock;
3038 Optional<MDNode *> VectorizedLoopID =
3039 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3040 LLVMLoopVectorizeFollowupVectorized});
3041 if (VectorizedLoopID.hasValue()) {
3042 Lp->setLoopID(VectorizedLoopID.getValue());
3044 // Do not setAlreadyVectorized if loop attributes have been defined
3045 // explicitly.
3046 return LoopVectorPreHeader;
3049 // Keep all loop hints from the original loop on the vector loop (we'll
3050 // replace the vectorizer-specific hints below).
3051 if (MDNode *LID = OrigLoop->getLoopID())
3052 Lp->setLoopID(LID);
3054 LoopVectorizeHints Hints(Lp, true, *ORE);
3055 Hints.setAlreadyVectorized();
3057 return LoopVectorPreHeader;
3060 // Fix up external users of the induction variable. At this point, we are
3061 // in LCSSA form, with all external PHIs that use the IV having one input value,
3062 // coming from the remainder loop. We need those PHIs to also have a correct
3063 // value for the IV when arriving directly from the middle block.
3064 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3065 const InductionDescriptor &II,
3066 Value *CountRoundDown, Value *EndValue,
3067 BasicBlock *MiddleBlock) {
3068 // There are two kinds of external IV usages - those that use the value
3069 // computed in the last iteration (the PHI) and those that use the penultimate
3070 // value (the value that feeds into the phi from the loop latch).
3071 // We allow both, but they, obviously, have different values.
3073 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3075 DenseMap<Value *, Value *> MissingVals;
3077 // An external user of the last iteration's value should see the value that
3078 // the remainder loop uses to initialize its own IV.
3079 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3080 for (User *U : PostInc->users()) {
3081 Instruction *UI = cast<Instruction>(U);
3082 if (!OrigLoop->contains(UI)) {
3083 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3084 MissingVals[UI] = EndValue;
3088 // An external user of the penultimate value needs to see EndValue - Step.
3089 // The simplest way to get this is to recompute it from the constituent SCEVs,
3090 // that is Start + (Step * (CRD - 1)).
3091 for (User *U : OrigPhi->users()) {
3092 auto *UI = cast<Instruction>(U);
3093 if (!OrigLoop->contains(UI)) {
3094 const DataLayout &DL =
3095 OrigLoop->getHeader()->getModule()->getDataLayout();
3096 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3098 IRBuilder<> B(MiddleBlock->getTerminator());
3099 Value *CountMinusOne = B.CreateSub(
3100 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3101 Value *CMO =
3102 !II.getStep()->getType()->isIntegerTy()
3103 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3104 II.getStep()->getType())
3105 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3106 CMO->setName("cast.cmo");
3107 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3108 Escape->setName("ind.escape");
3109 MissingVals[UI] = Escape;
3113 for (auto &I : MissingVals) {
3114 PHINode *PHI = cast<PHINode>(I.first);
3115 // One corner case we have to handle is two IVs "chasing" each other,
3116 // that is %IV2 = phi [...], [ %IV1, %latch ]
3117 // In this case, if IV1 has an external use, we need to avoid adding both
3118 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3119 // don't already have an incoming value for the middle block.
3120 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3121 PHI->addIncoming(I.second, MiddleBlock);
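// [Editorial note] Worked example for the penultimate-value computation above
// (numbers are hypothetical): with Start = 0, Step = 2 and a vector trip count
// CRD = 8, an external user of the phi itself sees
//   Start + Step * (CRD - 1) = 0 + 2 * 7 = 14,
// while an external user of the post-increment value sees EndValue = 16.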
3125 namespace {
3127 struct CSEDenseMapInfo {
3128 static bool canHandle(const Instruction *I) {
3129 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3130 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3133 static inline Instruction *getEmptyKey() {
3134 return DenseMapInfo<Instruction *>::getEmptyKey();
3137 static inline Instruction *getTombstoneKey() {
3138 return DenseMapInfo<Instruction *>::getTombstoneKey();
3141 static unsigned getHashValue(const Instruction *I) {
3142 assert(canHandle(I) && "Unknown instruction!");
3143 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3144 I->value_op_end()));
3147 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3148 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3149 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3150 return LHS == RHS;
3151 return LHS->isIdenticalTo(RHS);
3155 } // end anonymous namespace
3157 /// Perform CSE of induction variable instructions.
3158 static void cse(BasicBlock *BB) {
3159 // Perform simple cse.
3160 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3161 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3162 Instruction *In = &*I++;
3164 if (!CSEDenseMapInfo::canHandle(In))
3165 continue;
3167 // Check if we can replace this instruction with any of the
3168 // visited instructions.
3169 if (Instruction *V = CSEMap.lookup(In)) {
3170 In->replaceAllUsesWith(V);
3171 In->eraseFromParent();
3172 continue;
3175 CSEMap[In] = In;
3179 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3180 unsigned VF,
3181 bool &NeedToScalarize) {
3182 Function *F = CI->getCalledFunction();
3183 StringRef FnName = CI->getCalledFunction()->getName();
3184 Type *ScalarRetTy = CI->getType();
3185 SmallVector<Type *, 4> Tys, ScalarTys;
3186 for (auto &ArgOp : CI->arg_operands())
3187 ScalarTys.push_back(ArgOp->getType());
3189 // Estimate cost of scalarized vector call. The source operands are assumed
3190 // to be vectors, so we need to extract individual elements from there,
3191 // execute VF scalar calls, and then gather the result into the vector return
3192 // value.
3193 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3194 if (VF == 1)
3195 return ScalarCallCost;
3197 // Compute corresponding vector type for return value and arguments.
3198 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3199 for (Type *ScalarTy : ScalarTys)
3200 Tys.push_back(ToVectorTy(ScalarTy, VF));
3202 // Compute costs of unpacking argument values for the scalar calls and
3203 // packing the return values to a vector.
3204 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3206 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3208 // If we can't emit a vector call for this function, then the currently found
3209 // cost is the cost we need to return.
3210 NeedToScalarize = true;
3211 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3212 return Cost;
3214 // If the corresponding vector cost is cheaper, return its cost.
3215 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3216 if (VectorCallCost < Cost) {
3217 NeedToScalarize = false;
3218 return VectorCallCost;
3220 return Cost;
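// [Editorial note] Worked example for the cost comparison above (all numbers
// hypothetical): with VF = 4, a scalar call cost of 18 and a scalarization
// overhead of 12, the scalarized cost is 18 * 4 + 12 = 84. If TLI reports a
// vectorized variant costing 30, NeedToScalarize is cleared and 30 is
// returned; otherwise the scalarized cost of 84 is returned.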
3223 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3224 unsigned VF) {
3225 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3226 assert(ID && "Expected intrinsic call!");
3228 FastMathFlags FMF;
3229 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3230 FMF = FPMO->getFastMathFlags();
3232 SmallVector<Value *, 4> Operands(CI->arg_operands());
3233 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3236 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3237 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3238 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3239 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3241 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3242 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3243 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3244 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3247 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3248 // For every instruction `I` in MinBWs, truncate the operands, create a
3249 // truncated version of `I` and reextend its result. InstCombine runs
3250 // later and will remove any ext/trunc pairs.
3251 SmallPtrSet<Value *, 4> Erased;
3252 for (const auto &KV : Cost->getMinimalBitwidths()) {
3253 // If the value wasn't vectorized, we must maintain the original scalar
3254 // type. The absence of the value from VectorLoopValueMap indicates that it
3255 // wasn't vectorized.
3256 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3257 continue;
3258 for (unsigned Part = 0; Part < UF; ++Part) {
3259 Value *I = getOrCreateVectorValue(KV.first, Part);
3260 if (Erased.find(I) != Erased.end() || I->use_empty() ||
3261 !isa<Instruction>(I))
3262 continue;
3263 Type *OriginalTy = I->getType();
3264 Type *ScalarTruncatedTy =
3265 IntegerType::get(OriginalTy->getContext(), KV.second);
3266 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3267 OriginalTy->getVectorNumElements());
3268 if (TruncatedTy == OriginalTy)
3269 continue;
3271 IRBuilder<> B(cast<Instruction>(I));
3272 auto ShrinkOperand = [&](Value *V) -> Value * {
3273 if (auto *ZI = dyn_cast<ZExtInst>(V))
3274 if (ZI->getSrcTy() == TruncatedTy)
3275 return ZI->getOperand(0);
3276 return B.CreateZExtOrTrunc(V, TruncatedTy);
3279 // The actual instruction modification depends on the instruction type,
3280 // unfortunately.
3281 Value *NewI = nullptr;
3282 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3283 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3284 ShrinkOperand(BO->getOperand(1)));
3286 // Any wrapping introduced by shrinking this operation shouldn't be
3287 // considered undefined behavior. So, we can't unconditionally copy
3288 // arithmetic wrapping flags to NewI.
3289 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3290 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3291 NewI =
3292 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3293 ShrinkOperand(CI->getOperand(1)));
3294 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3295 NewI = B.CreateSelect(SI->getCondition(),
3296 ShrinkOperand(SI->getTrueValue()),
3297 ShrinkOperand(SI->getFalseValue()));
3298 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3299 switch (CI->getOpcode()) {
3300 default:
3301 llvm_unreachable("Unhandled cast!");
3302 case Instruction::Trunc:
3303 NewI = ShrinkOperand(CI->getOperand(0));
3304 break;
3305 case Instruction::SExt:
3306 NewI = B.CreateSExtOrTrunc(
3307 CI->getOperand(0),
3308 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3309 break;
3310 case Instruction::ZExt:
3311 NewI = B.CreateZExtOrTrunc(
3312 CI->getOperand(0),
3313 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3314 break;
3316 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3317 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3318 auto *O0 = B.CreateZExtOrTrunc(
3319 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3320 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3321 auto *O1 = B.CreateZExtOrTrunc(
3322 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3324 NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3325 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3326 // Don't do anything with the operands, just extend the result.
3327 continue;
3328 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3329 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3330 auto *O0 = B.CreateZExtOrTrunc(
3331 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3332 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3333 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3334 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3335 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3336 auto *O0 = B.CreateZExtOrTrunc(
3337 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3338 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3339 } else {
3340 // If we don't know what to do, be conservative and don't do anything.
3341 continue;
3344 // Lastly, extend the result.
3345 NewI->takeName(cast<Instruction>(I));
3346 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3347 I->replaceAllUsesWith(Res);
3348 cast<Instruction>(I)->eraseFromParent();
3349 Erased.insert(I);
3350 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3354 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3355 for (const auto &KV : Cost->getMinimalBitwidths()) {
3356 // If the value wasn't vectorized, we must maintain the original scalar
3357 // type. The absence of the value from VectorLoopValueMap indicates that it
3358 // wasn't vectorized.
3359 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3360 continue;
3361 for (unsigned Part = 0; Part < UF; ++Part) {
3362 Value *I = getOrCreateVectorValue(KV.first, Part);
3363 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3364 if (Inst && Inst->use_empty()) {
3365 Value *NewI = Inst->getOperand(0);
3366 Inst->eraseFromParent();
3367 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
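// [Editorial note] Illustrative effect of truncateToMinimalBitwidths, assuming
// the cost model proved 8 bits suffice for a 32-bit add and VF = 4:
//   %a.trunc = trunc <4 x i32> %a to <4 x i8>
//   %b.trunc = trunc <4 x i32> %b to <4 x i8>
//   %add     = add <4 x i8> %a.trunc, %b.trunc
//   %add.ext = zext <4 x i8> %add to <4 x i32>
// InstCombine is expected to remove the redundant trunc/zext pairs later.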
3373 void InnerLoopVectorizer::fixVectorizedLoop() {
3374 // Insert truncates and extends for any truncated instructions as hints to
3375 // InstCombine.
3376 if (VF > 1)
3377 truncateToMinimalBitwidths();
3379 // Fix widened non-induction PHIs by setting up the PHI operands.
3380 if (OrigPHIsToFix.size()) {
3381 assert(EnableVPlanNativePath &&
3382 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3383 fixNonInductionPHIs();
3386 // At this point every instruction in the original loop is widened to a
3387 // vector form. Now we need to fix the recurrences in the loop. These PHI
3388 // nodes are currently empty because we did not want to introduce cycles.
3389 // This is the second stage of vectorizing recurrences.
3390 fixCrossIterationPHIs();
3392 // Update the dominator tree.
3394 // FIXME: After creating the structure of the new loop, the dominator tree is
3395 // no longer up-to-date, and it remains that way until we update it
3396 // here. An out-of-date dominator tree is problematic for SCEV,
3397 // because SCEVExpander uses it to guide code generation. The
3398 // vectorizer uses SCEVExpanders in several places. Instead, we should
3399 // keep the dominator tree up-to-date as we go.
3400 updateAnalysis();
3402 // Fix-up external users of the induction variables.
3403 for (auto &Entry : *Legal->getInductionVars())
3404 fixupIVUsers(Entry.first, Entry.second,
3405 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3406 IVEndValues[Entry.first], LoopMiddleBlock);
3408 fixLCSSAPHIs();
3409 for (Instruction *PI : PredicatedInstructions)
3410 sinkScalarOperands(&*PI);
3412 // Remove redundant induction instructions.
3413 cse(LoopVectorBody);
3416 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3417 // In order to support recurrences we need to be able to vectorize Phi nodes.
3418 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3419 // stage #2: We now need to fix the recurrences by adding incoming edges to
3420 // the currently empty PHI nodes. At this point every instruction in the
3421 // original loop is widened to a vector form so we can use them to construct
3422 // the incoming edges.
3423 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3424 // Handle first-order recurrences and reductions that need to be fixed.
3425 if (Legal->isFirstOrderRecurrence(&Phi))
3426 fixFirstOrderRecurrence(&Phi);
3427 else if (Legal->isReductionVariable(&Phi))
3428 fixReduction(&Phi);
3432 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3433 // This is the second phase of vectorizing first-order recurrences. An
3434 // overview of the transformation is described below. Suppose we have the
3435 // following loop.
3437 // for (int i = 0; i < n; ++i)
3438 // b[i] = a[i] - a[i - 1];
3440 // There is a first-order recurrence on "a". For this loop, the shorthand
3441 // scalar IR looks like:
3443 // scalar.ph:
3444 // s_init = a[-1]
3445 // br scalar.body
3447 // scalar.body:
3448 // i = phi [0, scalar.ph], [i+1, scalar.body]
3449 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3450 // s2 = a[i]
3451 // b[i] = s2 - s1
3452 // br cond, scalar.body, ...
3454 // In this example, s1 is a recurrence because its value depends on the
3455 // previous iteration. In the first phase of vectorization, we created a
3456 // temporary value for s1. We now complete the vectorization and produce the
3457 // shorthand vector IR shown below (for VF = 4, UF = 1).
3459 // vector.ph:
3460 // v_init = vector(..., ..., ..., a[-1])
3461 // br vector.body
3463 // vector.body
3464 // i = phi [0, vector.ph], [i+4, vector.body]
3465 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3466 // v2 = a[i, i+1, i+2, i+3];
3467 // v3 = vector(v1(3), v2(0, 1, 2))
3468 // b[i, i+1, i+2, i+3] = v2 - v3
3469 // br cond, vector.body, middle.block
3471 // middle.block:
3472 // x = v2(3)
3473 // br scalar.ph
3475 // scalar.ph:
3476 // s_init = phi [x, middle.block], [a[-1], otherwise]
3477 // br scalar.body
3479 // After the vector loop finishes executing, we extract the next value of
3480 // the recurrence (x) to use as the initial value in the scalar loop.
3482 // Get the original loop preheader and single loop latch.
3483 auto *Preheader = OrigLoop->getLoopPreheader();
3484 auto *Latch = OrigLoop->getLoopLatch();
3486 // Get the initial and previous values of the scalar recurrence.
3487 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3488 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3490 // Create a vector from the initial value.
3491 auto *VectorInit = ScalarInit;
3492 if (VF > 1) {
3493 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3494 VectorInit = Builder.CreateInsertElement(
3495 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3496 Builder.getInt32(VF - 1), "vector.recur.init");
3499 // We constructed a temporary phi node in the first phase of vectorization.
3500 // This phi node will eventually be deleted.
3501 Builder.SetInsertPoint(
3502 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3504 // Create a phi node for the new recurrence. The current value will either be
3505 // the initial value inserted into a vector or loop-varying vector value.
3506 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3507 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3509 // Get the vectorized previous value of the last part UF - 1. It appears last
3510 // among all unrolled iterations, due to the order of their construction.
3511 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3513 // Set the insertion point after the previous value if it is an instruction.
3514 // Note that the previous value may have been constant-folded so it is not
3515 // guaranteed to be an instruction in the vector loop. Also, if the previous
3516 // value is a phi node, we should insert after all the phi nodes to avoid
3517 // breaking basic block verification.
3518 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3519 isa<PHINode>(PreviousLastPart))
3520 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3521 else
3522 Builder.SetInsertPoint(
3523 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3525 // We will construct a vector for the recurrence by combining the values for
3526 // the current and previous iterations. This is the required shuffle mask.
3527 SmallVector<Constant *, 8> ShuffleMask(VF);
3528 ShuffleMask[0] = Builder.getInt32(VF - 1);
3529 for (unsigned I = 1; I < VF; ++I)
3530 ShuffleMask[I] = Builder.getInt32(I + VF - 1);
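// [Editorial note] For example, with VF == 4 the mask is <3, 4, 5, 6>: the
// shuffle below takes lane 3 of the incoming (previous-iteration) vector
// followed by lanes 0..2 of the current part, matching v3 in the comment at
// the top of this function.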
3532 // The vector from which to take the initial value for the current iteration
3533 // (actual or unrolled). Initially, this is the vector phi node.
3534 Value *Incoming = VecPhi;
3536 // Shuffle the current and previous vector and update the vector parts.
3537 for (unsigned Part = 0; Part < UF; ++Part) {
3538 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3539 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3540 auto *Shuffle =
3541 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3542 ConstantVector::get(ShuffleMask))
3543 : Incoming;
3544 PhiPart->replaceAllUsesWith(Shuffle);
3545 cast<Instruction>(PhiPart)->eraseFromParent();
3546 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3547 Incoming = PreviousPart;
3550 // Fix the latch value of the new recurrence in the vector loop.
3551 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3553 // Extract the last vector element in the middle block. This will be the
3554 // initial value for the recurrence when jumping to the scalar loop.
3555 auto *ExtractForScalar = Incoming;
3556 if (VF > 1) {
3557 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3558 ExtractForScalar = Builder.CreateExtractElement(
3559 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3561 // Extract the second-to-last element in the middle block if the
3562 // Phi is used outside the loop. We need to extract the phi itself
3563 // and not the last element (the phi update in the current iteration). This
3564 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3565 // when the scalar loop is not run at all.
3566 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3567 if (VF > 1)
3568 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3569 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3570 // When the loop is unrolled without vectorizing, initialize
3571 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3572 // `Incoming`. This is analogous to the vectorized case above: extracting the
3573 // second-to-last element when VF > 1.
3574 else if (UF > 1)
3575 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3577 // Fix the initial value of the original recurrence in the scalar loop.
3578 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3579 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3580 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3581 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3582 Start->addIncoming(Incoming, BB);
3585 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3586 Phi->setName("scalar.recur");
3588 // Finally, fix users of the recurrence outside the loop. The users will need
3589 // either the last value of the scalar recurrence or the last value of the
3590 // vector recurrence we extracted in the middle block. Since the loop is in
3591 // LCSSA form, we just need to find all the phi nodes for the original scalar
3592 // recurrence in the exit block, and then add an edge for the middle block.
3593 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3594 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3595 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3600 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3601 Constant *Zero = Builder.getInt32(0);
3603 // Get its reduction variable descriptor.
3604 assert(Legal->isReductionVariable(Phi) &&
3605 "Unable to find the reduction variable");
3606 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3608 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3609 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3610 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3611 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3612 RdxDesc.getMinMaxRecurrenceKind();
3613 setDebugLocFromInst(Builder, ReductionStartValue);
3615 // We need to generate a reduction vector from the incoming scalar.
3616 // To do so, we need to generate the 'identity' vector and override
3617 // one of the elements with the incoming scalar reduction. We need
3618 // to do it in the vector-loop preheader.
3619 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3621 // This is the vector-clone of the value that leaves the loop.
3622 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3624 // Find the reduction identity variable: zero for addition, or, and xor;
3625 // one for multiplication; -1 for and.
3626 Value *Identity;
3627 Value *VectorStart;
3628 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3629 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3630 // MinMax reductions have the start value as their identity.
3631 if (VF == 1) {
3632 VectorStart = Identity = ReductionStartValue;
3633 } else {
3634 VectorStart = Identity =
3635 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3637 } else {
3638 // Handle other reduction kinds:
3639 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3640 RK, VecTy->getScalarType());
3641 if (VF == 1) {
3642 Identity = Iden;
3643 // This vector is the Identity vector where the first element is the
3644 // incoming scalar reduction.
3645 VectorStart = ReductionStartValue;
3646 } else {
3647 Identity = ConstantVector::getSplat(VF, Iden);
3649 // This vector is the Identity vector where the first element is the
3650 // incoming scalar reduction.
3651 VectorStart =
3652 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3656 // Fix the vector-loop phi.
3658 // Reductions do not have to start at zero. They can start with
3659 // any loop invariant values.
3660 BasicBlock *Latch = OrigLoop->getLoopLatch();
3661 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3662 for (unsigned Part = 0; Part < UF; ++Part) {
3663 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3664 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3665 // Make sure to add the reduction start value only to the
3666 // first unroll part.
3667 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3668 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3669 cast<PHINode>(VecRdxPhi)
3670 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3673 // Before each round, move the insertion point right between
3674 // the PHIs and the values we are going to write.
3675 // This allows us to write both PHINodes and the extractelement
3676 // instructions.
3677 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3679 setDebugLocFromInst(Builder, LoopExitInst);
3681 // If tail is folded by masking, the vector value to leave the loop should be
3682 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3683 // instead of the former.
3684 if (Cost->foldTailByMasking()) {
3685 for (unsigned Part = 0; Part < UF; ++Part) {
3686 Value *VecLoopExitInst =
3687 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3688 Value *Sel = nullptr;
3689 for (User *U : VecLoopExitInst->users()) {
3690 if (isa<SelectInst>(U)) {
3691 assert(!Sel && "Reduction exit feeding two selects");
3692 Sel = U;
3693 } else
3694 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3696 assert(Sel && "Reduction exit feeds no select");
3697 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3701 // If the vector reduction can be performed in a smaller type, we truncate
3702 // then extend the loop exit value to enable InstCombine to evaluate the
3703 // entire expression in the smaller type.
3704 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3705 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3706 Builder.SetInsertPoint(
3707 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3708 VectorParts RdxParts(UF);
3709 for (unsigned Part = 0; Part < UF; ++Part) {
3710 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3711 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3712 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3713 : Builder.CreateZExt(Trunc, VecTy);
3714 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3715 UI != RdxParts[Part]->user_end();)
3716 if (*UI != Trunc) {
3717 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3718 RdxParts[Part] = Extnd;
3719 } else {
3720 ++UI;
3723 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3724 for (unsigned Part = 0; Part < UF; ++Part) {
3725 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3726 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3730 // Reduce all of the unrolled parts into a single vector.
3731 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3732 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3734 // The middle block terminator has already been assigned a DebugLoc here (the
3735 // OrigLoop's single latch terminator). We want the whole middle block to
3736 // appear to execute on this line because: (a) it is all compiler generated,
3737 // (b) these instructions are always executed after evaluating the latch
3738 // conditional branch, and (c) other passes may add new predecessors which
3739 // terminate on this line. This is the easiest way to ensure we don't
3740 // accidentally cause an extra step back into the loop while debugging.
3741 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3742 for (unsigned Part = 1; Part < UF; ++Part) {
3743 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3744 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3745 // Floating point operations had to be 'fast' to enable the reduction.
3746 ReducedPartRdx = addFastMathFlag(
3747 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3748 ReducedPartRdx, "bin.rdx"),
3749 RdxDesc.getFastMathFlags());
3750 else
3751 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3752 RdxPart);
3755 if (VF > 1) {
3756 bool NoNaN = Legal->hasFunNoNaNAttr();
3757 ReducedPartRdx =
3758 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3759 // If the reduction can be performed in a smaller type, we need to extend
3760 // the reduction to the wider type before we branch to the original loop.
3761 if (Phi->getType() != RdxDesc.getRecurrenceType())
3762 ReducedPartRdx =
3763 RdxDesc.isSigned()
3764 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3765 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3768 // Create a phi node that merges control-flow from the backedge-taken check
3769 // block and the middle block.
3770 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3771 LoopScalarPreHeader->getTerminator());
3772 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3773 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3774 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3776 // Now, we need to fix the users of the reduction variable
3777 // inside and outside of the scalar remainder loop.
3778 // We know that the loop is in LCSSA form. We need to update the
3779 // PHI nodes in the exit blocks.
3780 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3781 // All PHINodes need to have a single entry edge, or two if
3782 // we already fixed them.
3783 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3785 // We found a reduction value exit-PHI. Update it with the
3786 // incoming bypass edge.
3787 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3788 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3789 } // end of the LCSSA phi scan.
3791 // Fix the scalar loop reduction variable with the incoming reduction sum
3792 // from the vector body and from the backedge value.
3793 int IncomingEdgeBlockIdx =
3794 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3795 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3796 // Pick the other block.
3797 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3798 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3799 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3802 void InnerLoopVectorizer::fixLCSSAPHIs() {
3803 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3804 if (LCSSAPhi.getNumIncomingValues() == 1) {
3805 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3806 // Non-instruction incoming values will have only one value.
3807 unsigned LastLane = 0;
3808 if (isa<Instruction>(IncomingValue))
3809 LastLane = Cost->isUniformAfterVectorization(
3810 cast<Instruction>(IncomingValue), VF)
3811 ? 0
3812 : VF - 1;
3813 // Can be a loop invariant incoming value or the last scalar value to be
3814 // extracted from the vectorized loop.
3815 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3816 Value *lastIncomingValue =
3817 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3818 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3823 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3824 // The basic block and loop containing the predicated instruction.
3825 auto *PredBB = PredInst->getParent();
3826 auto *VectorLoop = LI->getLoopFor(PredBB);
3828 // Initialize a worklist with the operands of the predicated instruction.
3829 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3831 // Holds instructions that we need to analyze again. An instruction may be
3832 // reanalyzed if we don't yet know if we can sink it or not.
3833 SmallVector<Instruction *, 8> InstsToReanalyze;
3835 // Returns true if a given use occurs in the predicated block. Phi nodes use
3836 // their operands in their corresponding predecessor blocks.
3837 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3838 auto *I = cast<Instruction>(U.getUser());
3839 BasicBlock *BB = I->getParent();
3840 if (auto *Phi = dyn_cast<PHINode>(I))
3841 BB = Phi->getIncomingBlock(
3842 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3843 return BB == PredBB;
3846 // Iteratively sink the scalarized operands of the predicated instruction
3847 // into the block we created for it. When an instruction is sunk, its
3848 // operands are then added to the worklist. The algorithm ends after one pass
3849 // through the worklist doesn't sink a single instruction.
3850 bool Changed;
3851 do {
3852 // Add the instructions that need to be reanalyzed to the worklist, and
3853 // reset the changed indicator.
3854 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3855 InstsToReanalyze.clear();
3856 Changed = false;
3858 while (!Worklist.empty()) {
3859 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3861 // We can't sink an instruction if it is a phi node, is already in the
3862 // predicated block, is not in the loop, or may have side effects.
3863 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3864 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3865 continue;
3867 // It's legal to sink the instruction if all its uses occur in the
3868 // predicated block. Otherwise, there's nothing to do yet, and we may
3869 // need to reanalyze the instruction.
3870 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3871 InstsToReanalyze.push_back(I);
3872 continue;
3875 // Move the instruction to the beginning of the predicated block, and add
3876 // its operands to the worklist.
3877 I->moveBefore(&*PredBB->getFirstInsertionPt());
3878 Worklist.insert(I->op_begin(), I->op_end());
3880 // The sinking may have enabled other instructions to be sunk, so we will
3881 // need to iterate.
3882 Changed = true;
3884 } while (Changed);
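// [Editorial note] Illustrative scenario for the sinking above: a scalarized,
// predicated udiv requires extractelement instructions for its operands. Once
// the udiv is their only user, those extracts are moved into the predicated
// block as well, so they execute only when the predicate is true.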
3887 void InnerLoopVectorizer::fixNonInductionPHIs() {
3888 for (PHINode *OrigPhi : OrigPHIsToFix) {
3889 PHINode *NewPhi =
3890 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3891 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3893 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3894 predecessors(OrigPhi->getParent()));
3895 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3896 predecessors(NewPhi->getParent()));
3897 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3898 "Scalar and Vector BB should have the same number of predecessors");
3900 // The insertion point in Builder may be invalidated by the time we get
3901 // here. Force the Builder insertion point to something valid so that we do
3902 // not run into issues during insertion point restore in
3903 // getOrCreateVectorValue calls below.
3904 Builder.SetInsertPoint(NewPhi);
3906 // The predecessor order is preserved and we can rely on mapping between
3907 // scalar and vector block predecessors.
3908 for (unsigned i = 0; i < NumIncomingValues; ++i) {
3909 BasicBlock *NewPredBB = VectorBBPredecessors[i];
3911 // When looking up the new scalar/vector values to fix up, use incoming
3912 // values from original phi.
3913 Value *ScIncV =
3914 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3916 // Scalar incoming value may need a broadcast
3917 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3918 NewPhi->addIncoming(NewIncV, NewPredBB);
3923 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3924 unsigned VF) {
3925 PHINode *P = cast<PHINode>(PN);
3926 if (EnableVPlanNativePath) {
3927 // Currently we enter here in the VPlan-native path for non-induction
3928 // PHIs where all control flow is uniform. We simply widen these PHIs.
3929 // Create a vector phi with no operands - the vector phi operands will be
3930 // set at the end of vector code generation.
3931 Type *VecTy =
3932 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3933 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3934 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3935 OrigPHIsToFix.push_back(P);
3937 return;
3940 assert(PN->getParent() == OrigLoop->getHeader() &&
3941 "Non-header phis should have been handled elsewhere");
3943 // In order to support recurrences we need to be able to vectorize Phi nodes.
3944 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3945 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3946 // this value when we vectorize all of the instructions that use the PHI.
3947 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3948 for (unsigned Part = 0; Part < UF; ++Part) {
3949 // This is phase one of vectorizing PHIs.
3950 Type *VecTy =
3951 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3952 Value *EntryPart = PHINode::Create(
3953 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3954 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3956 return;
3959 setDebugLocFromInst(Builder, P);
3961 // This PHINode must be an induction variable.
3962 // Make sure that we know about it.
3963 assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3965 InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3966 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3968 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3969 // which can be found from the original scalar operations.
3970 switch (II.getKind()) {
3971 case InductionDescriptor::IK_NoInduction:
3972 llvm_unreachable("Unknown induction");
3973 case InductionDescriptor::IK_IntInduction:
3974 case InductionDescriptor::IK_FpInduction:
3975 llvm_unreachable("Integer/fp induction is handled elsewhere.");
3976 case InductionDescriptor::IK_PtrInduction: {
3977 // Handle the pointer induction variable case.
3978 assert(P->getType()->isPointerTy() && "Unexpected type.");
3979 // This is the normalized GEP that starts counting at zero.
3980 Value *PtrInd = Induction;
3981 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3982 // Determine the number of scalars we need to generate for each unroll
3983 // iteration. If the instruction is uniform, we only need to generate the
3984 // first lane. Otherwise, we generate all VF values.
3985 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3986 // These are the scalar results. Notice that we don't generate vector GEPs
3987 // because scalar GEPs result in better code.
3988 for (unsigned Part = 0; Part < UF; ++Part) {
3989 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3990 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3991 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3992 Value *SclrGep =
3993 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3994 SclrGep->setName("next.gep");
3995 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3998 return;
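// [Editorial note] Illustrative output of the pointer-induction case above for
// VF = 4, UF = 2 (values are hypothetical): for unroll part P and lane L,
//   next.gep = gep StartValue, ((PtrInd + P * 4 + L) * Step)
// giving eight scalar GEPs per vector iteration, or only the lane-0 GEP per
// part when the pointer is uniform after vectorization.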
4003 /// A helper function for checking whether an integer division-related
4004 /// instruction may divide by zero (in which case it must be predicated if
4005 /// executed conditionally in the scalar code).
4006 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4007 /// Non-zero divisors that are not compile-time constants will not be
4008 /// converted into multiplication, so we will still end up scalarizing
4009 /// the division, but can do so w/o predication.
4010 static bool mayDivideByZero(Instruction &I) {
4011 assert((I.getOpcode() == Instruction::UDiv ||
4012 I.getOpcode() == Instruction::SDiv ||
4013 I.getOpcode() == Instruction::URem ||
4014 I.getOpcode() == Instruction::SRem) &&
4015 "Unexpected instruction");
4016 Value *Divisor = I.getOperand(1);
4017 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4018 return !CInt || CInt->isZero();
4021 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4022 switch (I.getOpcode()) {
4023 case Instruction::Br:
4024 case Instruction::PHI:
4025 llvm_unreachable("This instruction is handled by a different recipe.");
4026 case Instruction::GetElementPtr: {
4027 // Construct a vector GEP by widening the operands of the scalar GEP as
4028 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4029 // results in a vector of pointers when at least one operand of the GEP
4030 // is vector-typed. Thus, to keep the representation compact, we only use
4031 // vector-typed operands for loop-varying values.
4032 auto *GEP = cast<GetElementPtrInst>(&I);
4034 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4035 // If we are vectorizing, but the GEP has only loop-invariant operands,
4036 // the GEP we build (by only using vector-typed operands for
4037 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4038 // produce a vector of pointers, we need to either arbitrarily pick an
4039 // operand to broadcast, or broadcast a clone of the original GEP.
4040 // Here, we broadcast a clone of the original.
4042 // TODO: If at some point we decide to scalarize instructions having
4043 // loop-invariant operands, this special case will no longer be
4044 // required. We would add the scalarization decision to
4045 // collectLoopScalars() and teach getVectorValue() to broadcast
4046 // the lane-zero scalar value.
4047 auto *Clone = Builder.Insert(GEP->clone());
4048 for (unsigned Part = 0; Part < UF; ++Part) {
4049 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4050 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4051 addMetadata(EntryPart, GEP);
4053 } else {
4054 // If the GEP has at least one loop-varying operand, we are sure to
4055 // produce a vector of pointers. But if we are only unrolling, we want
4056 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4057 // produce with the code below will be scalar (if VF == 1) or vector
4058 // (otherwise). Note that for the unroll-only case, we still maintain
4059 // values in the vector mapping with initVector, as we do for other
4060 // instructions.
4061 for (unsigned Part = 0; Part < UF; ++Part) {
4062 // The pointer operand of the new GEP. If it's loop-invariant, we
4063 // won't broadcast it.
4064 auto *Ptr =
4065 OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4066 ? GEP->getPointerOperand()
4067 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4069 // Collect all the indices for the new GEP. If any index is
4070 // loop-invariant, we won't broadcast it.
4071 SmallVector<Value *, 4> Indices;
4072 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4073 if (OrigLoop->isLoopInvariant(U.get()))
4074 Indices.push_back(U.get());
4075 else
4076 Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4079 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4080 // but it should be a vector, otherwise.
4081 auto *NewGEP =
4082 GEP->isInBounds()
4083 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4084 Indices)
4085 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4086 assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4087 "NewGEP is not a pointer vector");
4088 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4089 addMetadata(NewGEP, GEP);
4093 break;
4095 case Instruction::UDiv:
4096 case Instruction::SDiv:
4097 case Instruction::SRem:
4098 case Instruction::URem:
4099 case Instruction::Add:
4100 case Instruction::FAdd:
4101 case Instruction::Sub:
4102 case Instruction::FSub:
4103 case Instruction::FNeg:
4104 case Instruction::Mul:
4105 case Instruction::FMul:
4106 case Instruction::FDiv:
4107 case Instruction::FRem:
4108 case Instruction::Shl:
4109 case Instruction::LShr:
4110 case Instruction::AShr:
4111 case Instruction::And:
4112 case Instruction::Or:
4113 case Instruction::Xor: {
4114 // Just widen unops and binops.
4115 setDebugLocFromInst(Builder, &I);
4117 for (unsigned Part = 0; Part < UF; ++Part) {
4118 SmallVector<Value *, 2> Ops;
4119 for (Value *Op : I.operands())
4120 Ops.push_back(getOrCreateVectorValue(Op, Part));
4122 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4124 if (auto *VecOp = dyn_cast<Instruction>(V))
4125 VecOp->copyIRFlags(&I);
4127 // Use this vector value for all users of the original instruction.
4128 VectorLoopValueMap.setVectorValue(&I, Part, V);
4129 addMetadata(V, &I);
4132 break;
4134 case Instruction::Select: {
4135 // Widen selects.
4136 // If the selector is loop invariant we can create a select
4137 // instruction with a scalar condition. Otherwise, use vector-select.
4138 auto *SE = PSE.getSE();
4139 bool InvariantCond =
4140 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4141 setDebugLocFromInst(Builder, &I);
4143 // The condition can be loop invariant but still defined inside the
4144 // loop. This means that we can't just use the original 'cond' value.
4145 // We have to take the 'vectorized' value and pick the first lane.
4146 // Instcombine will make this a no-op.
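// For example (illustrative IR), for "select i1 %c, i32 %a, i32 %b" with a
// loop-invariant %c, the widened select keeps the scalar lane-zero condition
// and only %a and %b are replaced by their <VF x i32> vector values.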
4148 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4150 for (unsigned Part = 0; Part < UF; ++Part) {
4151 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4152 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4153 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4154 Value *Sel =
4155 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4156 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4157 addMetadata(Sel, &I);
4160 break;
4163 case Instruction::ICmp:
4164 case Instruction::FCmp: {
4165 // Widen compares. Generate vector compares.
4166 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4167 auto *Cmp = dyn_cast<CmpInst>(&I);
4168 setDebugLocFromInst(Builder, Cmp);
4169 for (unsigned Part = 0; Part < UF; ++Part) {
4170 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4171 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4172 Value *C = nullptr;
4173 if (FCmp) {
4174 // Propagate fast math flags.
4175 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4176 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4177 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4178 } else {
4179 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4181 VectorLoopValueMap.setVectorValue(&I, Part, C);
4182 addMetadata(C, &I);
4185 break;
4188 case Instruction::ZExt:
4189 case Instruction::SExt:
4190 case Instruction::FPToUI:
4191 case Instruction::FPToSI:
4192 case Instruction::FPExt:
4193 case Instruction::PtrToInt:
4194 case Instruction::IntToPtr:
4195 case Instruction::SIToFP:
4196 case Instruction::UIToFP:
4197 case Instruction::Trunc:
4198 case Instruction::FPTrunc:
4199 case Instruction::BitCast: {
4200 auto *CI = dyn_cast<CastInst>(&I);
4201 setDebugLocFromInst(Builder, CI);
4203 /// Vectorize casts.
4204 Type *DestTy =
4205 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4207 for (unsigned Part = 0; Part < UF; ++Part) {
4208 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4209 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4210 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4211 addMetadata(Cast, &I);
4213 break;
4216 case Instruction::Call: {
4217 // Ignore dbg intrinsics.
4218 if (isa<DbgInfoIntrinsic>(I))
4219 break;
4220 setDebugLocFromInst(Builder, &I);
4222 Module *M = I.getParent()->getParent()->getParent();
4223 auto *CI = cast<CallInst>(&I);
4225 StringRef FnName = CI->getCalledFunction()->getName();
4226 Function *F = CI->getCalledFunction();
4227 Type *RetTy = ToVectorTy(CI->getType(), VF);
4228 SmallVector<Type *, 4> Tys;
4229 for (Value *ArgOperand : CI->arg_operands())
4230 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4232 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4234 // This flag shows whether we should use an intrinsic or an ordinary call for
4235 // the vectorized version of the instruction, i.e., whether it is beneficial
4236 // to call the vector intrinsic rather than the vector library function.
4237 bool NeedToScalarize;
4238 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4239 bool UseVectorIntrinsic =
4240 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4241 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4242 "Instruction should be scalarized elsewhere.");
4244 for (unsigned Part = 0; Part < UF; ++Part) {
4245 SmallVector<Value *, 4> Args;
4246 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4247 Value *Arg = CI->getArgOperand(i);
4248 // Some intrinsics have a scalar argument - don't replace it with a
4249 // vector.
4250 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4251 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4252 Args.push_back(Arg);
4255 Function *VectorF;
4256 if (UseVectorIntrinsic) {
4257 // Use vector version of the intrinsic.
4258 Type *TysForDecl[] = {CI->getType()};
4259 if (VF > 1)
4260 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4261 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4262 } else {
4263 // Use vector version of the library call.
4264 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4265 assert(!VFnName.empty() && "Vector function name is empty.");
4266 VectorF = M->getFunction(VFnName);
4267 if (!VectorF) {
4268 // Generate a declaration
4269 FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4270 VectorF =
4271 Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4272 VectorF->copyAttributesFrom(F);
4275 assert(VectorF && "Can't create vector function.");
4277 SmallVector<OperandBundleDef, 1> OpBundles;
4278 CI->getOperandBundlesAsDefs(OpBundles);
4279 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4281 if (isa<FPMathOperator>(V))
4282 V->copyFastMathFlags(CI);
4284 VectorLoopValueMap.setVectorValue(&I, Part, V);
4285 addMetadata(V, &I);
4288 break;
4291 default:
4292 // This instruction is not vectorized by simple widening.
4293 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4294 llvm_unreachable("Unhandled instruction!");
4295 } // end of switch.
4298 void InnerLoopVectorizer::updateAnalysis() {
4299 // Forget the original basic block.
4300 PSE.getSE()->forgetLoop(OrigLoop);
4302 // DT is not kept up-to-date for outer loop vectorization
4303 if (EnableVPlanNativePath)
4304 return;
4306 // Update the dominator tree information.
4307 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4308 "Entry does not dominate exit.");
4310 DT->addNewBlock(LoopMiddleBlock,
4311 LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4312 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4313 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4314 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4315 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4318 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4319 // We should not collect Scalars more than once per VF. Right now, this
4320 // function is called from collectUniformsAndScalars(), which already does
4321 // this check. Collecting Scalars for VF=1 does not make any sense.
4322 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4323 "This function should not be visited twice for the same VF");
4325 SmallSetVector<Instruction *, 8> Worklist;
4327 // These sets are used to seed the analysis with pointers used by memory
4328 // accesses that will remain scalar.
4329 SmallSetVector<Instruction *, 8> ScalarPtrs;
4330 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4332 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4333 // The pointer operands of loads and stores will be scalar as long as the
4334 // memory access is not a gather or scatter operation. The value operand of a
4335 // store will remain scalar if the store is scalarized.
4336 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4337 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4338 assert(WideningDecision != CM_Unknown &&
4339 "Widening decision should be ready at this moment");
4340 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4341 if (Ptr == Store->getValueOperand())
4342 return WideningDecision == CM_Scalarize;
4343 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4344 "Ptr is neither a value or pointer operand");
4345 return WideningDecision != CM_GatherScatter;
4348 // A helper that returns true if the given value is a bitcast or
4349 // getelementptr instruction contained in the loop.
4350 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4351 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4352 isa<GetElementPtrInst>(V)) &&
4353 !TheLoop->isLoopInvariant(V);
4356 // A helper that evaluates a memory access's use of a pointer. If the use
4357 // will be a scalar use, and the pointer is only used by memory accesses, we
4358 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4359 // PossibleNonScalarPtrs.
4360 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4361 // We only care about bitcast and getelementptr instructions contained in
4362 // the loop.
4363 if (!isLoopVaryingBitCastOrGEP(Ptr))
4364 return;
4366 // If the pointer has already been identified as scalar (e.g., if it was
4367 // also identified as uniform), there's nothing to do.
4368 auto *I = cast<Instruction>(Ptr);
4369 if (Worklist.count(I))
4370 return;
4372 // If the use of the pointer will be a scalar use, and all users of the
4373 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4374 // place the pointer in PossibleNonScalarPtrs.
4375 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4376 return isa<LoadInst>(U) || isa<StoreInst>(U);
4378 ScalarPtrs.insert(I);
4379 else
4380 PossibleNonScalarPtrs.insert(I);
4383 // We seed the scalars analysis with three classes of instructions: (1)
4384 // instructions marked uniform-after-vectorization, (2) bitcast and
4385 // getelementptr instructions used by memory accesses requiring a scalar use,
4386 // and (3) pointer induction variables and their update instructions (we
4387 // currently only scalarize these).
4389 // (1) Add to the worklist all instructions that have been identified as
4390 // uniform-after-vectorization.
4391 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4393 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4394 // memory accesses requiring a scalar use. The pointer operands of loads and
4395 // stores will be scalar as long as the memory access is not a gather or
4396 // scatter operation. The value operand of a store will remain scalar if the
4397 // store is scalarized.
4398 for (auto *BB : TheLoop->blocks())
4399 for (auto &I : *BB) {
4400 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4401 evaluatePtrUse(Load, Load->getPointerOperand());
4402 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4403 evaluatePtrUse(Store, Store->getPointerOperand());
4404 evaluatePtrUse(Store, Store->getValueOperand());
4407 for (auto *I : ScalarPtrs)
4408 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4409 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4410 Worklist.insert(I);
4413 // (3) Add to the worklist all pointer induction variables and their update
4414 // instructions.
4416 // TODO: Once we are able to vectorize pointer induction variables we should
4417 // no longer insert them into the worklist here.
4418 auto *Latch = TheLoop->getLoopLatch();
4419 for (auto &Induction : *Legal->getInductionVars()) {
4420 auto *Ind = Induction.first;
4421 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4422 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4423 continue;
4424 Worklist.insert(Ind);
4425 Worklist.insert(IndUpdate);
4426 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4427 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4428 << "\n");
4431 // Insert the forced scalars.
4432 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4433 // induction variable when the PHI user is scalarized.
4434 auto ForcedScalar = ForcedScalars.find(VF);
4435 if (ForcedScalar != ForcedScalars.end())
4436 for (auto *I : ForcedScalar->second)
4437 Worklist.insert(I);
4439 // Expand the worklist by looking through any bitcasts and getelementptr
4440 // instructions we've already identified as scalar. This is similar to the
4441 // expansion step in collectLoopUniforms(); however, here we're only
4442 // expanding to include additional bitcasts and getelementptr instructions.
4443 unsigned Idx = 0;
4444 while (Idx != Worklist.size()) {
4445 Instruction *Dst = Worklist[Idx++];
4446 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4447 continue;
4448 auto *Src = cast<Instruction>(Dst->getOperand(0));
4449 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4450 auto *J = cast<Instruction>(U);
4451 return !TheLoop->contains(J) || Worklist.count(J) ||
4452 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4453 isScalarUse(J, Src));
4454 })) {
4455 Worklist.insert(Src);
4456 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4460 // An induction variable will remain scalar if all users of the induction
4461 // variable and induction variable update remain scalar.
4462 for (auto &Induction : *Legal->getInductionVars()) {
4463 auto *Ind = Induction.first;
4464 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4466 // We already considered pointer induction variables, so there's no reason
4467 // to look at their users again.
4469 // TODO: Once we are able to vectorize pointer induction variables we
4470 // should no longer skip over them here.
4471 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4472 continue;
4474 // Determine if all users of the induction variable are scalar after
4475 // vectorization.
4476 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4477 auto *I = cast<Instruction>(U);
4478 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4480 if (!ScalarInd)
4481 continue;
4483 // Determine if all users of the induction variable update instruction are
4484 // scalar after vectorization.
4485 auto ScalarIndUpdate =
4486 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4487 auto *I = cast<Instruction>(U);
4488 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4490 if (!ScalarIndUpdate)
4491 continue;
4493 // The induction variable and its update instruction will remain scalar.
4494 Worklist.insert(Ind);
4495 Worklist.insert(IndUpdate);
4496 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4497 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4498 << "\n");
4501 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4504 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4505 if (!blockNeedsPredication(I->getParent()))
4506 return false;
4507 switch(I->getOpcode()) {
4508 default:
4509 break;
4510 case Instruction::Load:
4511 case Instruction::Store: {
4512 if (!Legal->isMaskRequired(I))
4513 return false;
4514 auto *Ptr = getLoadStorePointerOperand(I);
4515 auto *Ty = getMemInstValueType(I);
4516 // We have already decided how to vectorize this instruction, get that
4517 // result.
4518 if (VF > 1) {
4519 InstWidening WideningDecision = getWideningDecision(I, VF);
4520 assert(WideningDecision != CM_Unknown &&
4521 "Widening decision should be ready at this moment");
4522 return WideningDecision == CM_Scalarize;
4524 return isa<LoadInst>(I) ?
4525 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4526 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4528 case Instruction::UDiv:
4529 case Instruction::SDiv:
4530 case Instruction::SRem:
4531 case Instruction::URem:
4532 return mayDivideByZero(*I);
4534 return false;
4537 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4538 unsigned VF) {
4539 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4540 assert(getWideningDecision(I, VF) == CM_Unknown &&
4541 "Decision should not be set yet.");
4542 auto *Group = getInterleavedAccessGroup(I);
4543 assert(Group && "Must have a group.");
4545 // If the instruction's allocated size doesn't equal its type size, it
4546 // requires padding and will be scalarized.
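// For example, a type such as i120 is stored with padding up to 128 bits, so
// VF consecutive scalar accesses do not match a packed <VF x i120> vector and
// the group must be scalarized instead.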
4547 auto &DL = I->getModule()->getDataLayout();
4548 auto *ScalarTy = getMemInstValueType(I);
4549 if (hasIrregularType(ScalarTy, DL, VF))
4550 return false;
4552 // Check if masking is required.
4553 // A Group may need masking for one of two reasons: it resides in a block that
4554 // needs predication, or it was decided to use masking to deal with gaps.
4555 bool PredicatedAccessRequiresMasking =
4556 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4557 bool AccessWithGapsRequiresMasking =
4558 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4559 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4560 return true;
4562 // If masked interleaving is required, we expect that the user/target had
4563 // enabled it, because otherwise it either wouldn't have been created or
4564 // it should have been invalidated by the CostModel.
4565 assert(useMaskedInterleavedAccesses(TTI) &&
4566 "Masked interleave-groups for predicated accesses are not enabled.");
4568 auto *Ty = getMemInstValueType(I);
4569 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4570 : TTI.isLegalMaskedStore(Ty);
4573 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4574 unsigned VF) {
4575 // Get and ensure we have a valid memory instruction.
4576 LoadInst *LI = dyn_cast<LoadInst>(I);
4577 StoreInst *SI = dyn_cast<StoreInst>(I);
4578 assert((LI || SI) && "Invalid memory instruction");
4580 auto *Ptr = getLoadStorePointerOperand(I);
4582 // In order to be widened, the pointer should be consecutive, first of all.
4583 if (!Legal->isConsecutivePtr(Ptr))
4584 return false;
4586 // If the instruction is a store located in a predicated block, it will be
4587 // scalarized.
4588 if (isScalarWithPredication(I))
4589 return false;
4591 // If the instruction's allocated size doesn't equal its type size, it
4592 // requires padding and will be scalarized.
4593 auto &DL = I->getModule()->getDataLayout();
4594 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4595 if (hasIrregularType(ScalarTy, DL, VF))
4596 return false;
4598 return true;
4601 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4602 // We should not collect Uniforms more than once per VF. Right now,
4603 // this function is called from collectUniformsAndScalars(), which
4604 // already does this check. Collecting Uniforms for VF=1 does not make any
4605 // sense.
4607 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4608 "This function should not be visited twice for the same VF");
4610 // Visit the list of Uniforms. Even if we do not find any uniform values, we
4611 // won't analyze the loop again: Uniforms.count(VF) will still return 1.
4612 Uniforms[VF].clear();
4614 // We now know that the loop is vectorizable!
4615 // Collect instructions inside the loop that will remain uniform after
4616 // vectorization.
4618 // Global values, params and instructions outside of current loop are out of
4619 // scope.
4620 auto isOutOfScope = [&](Value *V) -> bool {
4621 Instruction *I = dyn_cast<Instruction>(V);
4622 return (!I || !TheLoop->contains(I));
4625 SetVector<Instruction *> Worklist;
4626 BasicBlock *Latch = TheLoop->getLoopLatch();
4628 // Start with the conditional branch. If the branch condition is an
4629 // instruction contained in the loop that is only used by the branch, it is
4630 // uniform.
4631 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4632 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4633 Worklist.insert(Cmp);
4634 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4637 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4638 // are pointers that are treated like consecutive pointers during
4639 // vectorization. The pointer operands of interleaved accesses are an
4640 // example.
4641 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4643 // Holds pointer operands of instructions that are possibly non-uniform.
4644 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4646 auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4647 InstWidening WideningDecision = getWideningDecision(I, VF);
4648 assert(WideningDecision != CM_Unknown &&
4649 "Widening decision should be ready at this moment");
4651 return (WideningDecision == CM_Widen ||
4652 WideningDecision == CM_Widen_Reverse ||
4653 WideningDecision == CM_Interleave);
4655 // Iterate over the instructions in the loop, and collect all
4656 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4657 // that a consecutive-like pointer operand will be scalarized, we collect it
4658 // in PossibleNonUniformPtrs instead. We use two sets here because a single
4659 // getelementptr instruction can be used by both vectorized and scalarized
4660 // memory instructions. For example, if a loop loads and stores from the same
4661 // location, but the store is conditional, the store will be scalarized, and
4662 // the getelementptr won't remain uniform.
4663 for (auto *BB : TheLoop->blocks())
4664 for (auto &I : *BB) {
4665 // If there's no pointer operand, there's nothing to do.
4666 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4667 if (!Ptr)
4668 continue;
4670 // True if all users of Ptr are memory accesses that have Ptr as their
4671 // pointer operand.
4672 auto UsersAreMemAccesses =
4673 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4674 return getLoadStorePointerOperand(U) == Ptr;
4677 // Ensure the memory instruction will not be scalarized or used by
4678 // gather/scatter, making its pointer operand non-uniform. If the pointer
4679 // operand is used by any instruction other than a memory access, we
4680 // conservatively assume the pointer operand may be non-uniform.
4681 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4682 PossibleNonUniformPtrs.insert(Ptr);
4684 // If the memory instruction will be vectorized and its pointer operand
4685 // is consecutive-like, or interleaving - the pointer operand should
4686 // remain uniform.
4687 else
4688 ConsecutiveLikePtrs.insert(Ptr);
4691 // Add to the Worklist all consecutive and consecutive-like pointers that
4692 // aren't also identified as possibly non-uniform.
4693 for (auto *V : ConsecutiveLikePtrs)
4694 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4695 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4696 Worklist.insert(V);
4699 // Expand Worklist in topological order: whenever a new instruction
4700 // is added, its users should already be inside Worklist. This ensures that
4701 // a uniform instruction will only be used by uniform instructions.
4702 unsigned idx = 0;
4703 while (idx != Worklist.size()) {
4704 Instruction *I = Worklist[idx++];
4706 for (auto OV : I->operand_values()) {
4707 // isOutOfScope operands cannot be uniform instructions.
4708 if (isOutOfScope(OV))
4709 continue;
4710 // First order recurrence Phi's should typically be considered
4711 // non-uniform.
4712 auto *OP = dyn_cast<PHINode>(OV);
4713 if (OP && Legal->isFirstOrderRecurrence(OP))
4714 continue;
4715 // If all the users of the operand are uniform, then add the
4716 // operand into the uniform worklist.
4717 auto *OI = cast<Instruction>(OV);
4718 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4719 auto *J = cast<Instruction>(U);
4720 return Worklist.count(J) ||
4721 (OI == getLoadStorePointerOperand(J) &&
4722 isUniformDecision(J, VF));
4723 })) {
4724 Worklist.insert(OI);
4725 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4730 // Returns true if Ptr is the pointer operand of a memory access instruction
4731 // I, and I is known to not require scalarization.
4732 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4733 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4736 // For an instruction to be added into Worklist above, all its users inside
4737 // the loop should also be in Worklist. However, this condition cannot be
4738 // true for phi nodes that form a cyclic dependence. We must process phi
4739 // nodes separately. An induction variable will remain uniform if all users
4740 // of the induction variable and induction variable update remain uniform.
4741 // The code below handles both pointer and non-pointer induction variables.
4742 for (auto &Induction : *Legal->getInductionVars()) {
4743 auto *Ind = Induction.first;
4744 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4746 // Determine if all users of the induction variable are uniform after
4747 // vectorization.
4748 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4749 auto *I = cast<Instruction>(U);
4750 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4751 isVectorizedMemAccessUse(I, Ind);
4753 if (!UniformInd)
4754 continue;
4756 // Determine if all users of the induction variable update instruction are
4757 // uniform after vectorization.
4758 auto UniformIndUpdate =
4759 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4760 auto *I = cast<Instruction>(U);
4761 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4762 isVectorizedMemAccessUse(I, IndUpdate);
4764 if (!UniformIndUpdate)
4765 continue;
4767 // The induction variable and its update instruction will remain uniform.
4768 Worklist.insert(Ind);
4769 Worklist.insert(IndUpdate);
4770 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4771 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4772 << "\n");
4775 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4778 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4779 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4781 if (Legal->getRuntimePointerChecking()->Need) {
4782 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4783 "runtime pointer checks needed. Enable vectorization of this "
4784 "loop with '#pragma clang loop vectorize(enable)' when "
4785 "compiling with -Os/-Oz",
4786 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4787 return true;
4790 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4791 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4792 "runtime SCEV checks needed. Enable vectorization of this "
4793 "loop with '#pragma clang loop vectorize(enable)' when "
4794 "compiling with -Os/-Oz",
4795 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4796 return true;
4799 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4800 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4801 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4802 "runtime stride == 1 checks needed. Enable vectorization of "
4803 "this loop with '#pragma clang loop vectorize(enable)' when "
4804 "compiling with -Os/-Oz",
4805 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4806 return true;
4809 return false;
4812 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4813 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4814 // TODO: It may be useful to do this, since the check is still likely to be
4815 // dynamically uniform if the target can skip it.
4816 reportVectorizationFailure(
4817 "Not inserting runtime ptr check for divergent target",
4818 "runtime pointer checks needed. Not enabled for divergent target",
4819 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4820 return None;
4823 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4824 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4825 if (TC == 1) {
4826 reportVectorizationFailure("Single iteration (non) loop",
4827 "loop trip count is one, irrelevant for vectorization",
4828 "SingleIterationLoop", ORE, TheLoop);
4829 return None;
4832 switch (ScalarEpilogueStatus) {
4833 case CM_ScalarEpilogueAllowed:
4834 return computeFeasibleMaxVF(TC);
4835 case CM_ScalarEpilogueNotNeededUsePredicate:
4836 LLVM_DEBUG(
4837 dbgs() << "LV: vector predicate hint/switch found.\n"
4838 << "LV: Not allowing scalar epilogue, creating predicated "
4839 << "vector loop.\n");
4840 break;
4841 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4842 // fallthrough as a special case of OptForSize
4843 case CM_ScalarEpilogueNotAllowedOptSize:
4844 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4845 LLVM_DEBUG(
4846 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4847 else
4848 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4849 << "count.\n");
4851 // Bail if runtime checks are required, which are not good when optimising
4852 // for size.
4853 if (runtimeChecksRequired())
4854 return None;
4855 break;
4858 // Now try the tail folding
4860 // Invalidate interleave groups that require an epilogue if we can't mask
4861 // the interleave-group.
4862 if (!useMaskedInterleavedAccesses(TTI))
4863 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4865 unsigned MaxVF = computeFeasibleMaxVF(TC);
4866 if (TC > 0 && TC % MaxVF == 0) {
4867 // Accept MaxVF if we do not have a tail.
4868 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4869 return MaxVF;
4872 // If we don't know the precise trip count, or if the trip count that we
4873 // found modulo the vectorization factor is not zero, try to fold the tail
4874 // by masking.
4875 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
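// For example (illustrative trip count), TC = 100 with MaxVF = 8 would leave
// a tail of 4 iterations; folding the tail by masking instead executes 13
// vector iterations, with the unused lanes of the last iteration masked off.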
4876 if (Legal->prepareToFoldTailByMasking()) {
4877 FoldTailByMasking = true;
4878 return MaxVF;
4881 if (TC == 0) {
4882 reportVectorizationFailure(
4883 "Unable to calculate the loop count due to complex control flow",
4884 "unable to calculate the loop count due to complex control flow",
4885 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4886 return None;
4889 reportVectorizationFailure(
4890 "Cannot optimize for size and vectorize at the same time.",
4891 "cannot optimize for size and vectorize at the same time. "
4892 "Enable vectorization of this loop with '#pragma clang loop "
4893 "vectorize(enable)' when compiling with -Os/-Oz",
4894 "NoTailLoopWithOptForSize", ORE, TheLoop);
4895 return None;
4898 unsigned
4899 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4900 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4901 unsigned SmallestType, WidestType;
4902 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4903 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4905 // Get the maximum safe dependence distance in bits computed by LAA.
4906 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4907 // the memory access that is most restrictive (involved in the smallest
4908 // dependence distance).
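// For example (illustrative values), if the most restrictive dependence
// allows at most 4 iterations in flight on i32 accesses, then
// MaxSafeRegisterWidth = 4 * 4 * 8 = 128 bits, and WidestRegister is clamped
// to 128 below.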
4909 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4911 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4913 unsigned MaxVectorSize = WidestRegister / WidestType;
4915 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4916 << " / " << WidestType << " bits.\n");
4917 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4918 << WidestRegister << " bits.\n");
4920 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4921 " into one vector!");
4922 if (MaxVectorSize == 0) {
4923 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4924 MaxVectorSize = 1;
4925 return MaxVectorSize;
4926 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4927 isPowerOf2_32(ConstTripCount)) {
4928 // We need to clamp the VF to be the ConstTripCount. There is no point in
4929 // choosing a higher viable VF as done in the loop below.
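// For example, a constant trip count of 4 with MaxVectorSize = 8 clamps the
// VF to 4, so the entire loop becomes a single vector iteration with no tail.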
4930 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4931 << ConstTripCount << "\n");
4932 MaxVectorSize = ConstTripCount;
4933 return MaxVectorSize;
4936 unsigned MaxVF = MaxVectorSize;
4937 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4938 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4939 // Collect all viable vectorization factors larger than the default MaxVF
4940 // (i.e. MaxVectorSize).
4941 SmallVector<unsigned, 8> VFs;
4942 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4943 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4944 VFs.push_back(VS);
4946 // For each VF calculate its register usage.
4947 auto RUs = calculateRegisterUsage(VFs);
4949 // Select the largest VF which doesn't require more registers than existing
4950 // ones.
4951 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4952 for (int i = RUs.size() - 1; i >= 0; --i) {
4953 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4954 MaxVF = VFs[i];
4955 break;
4958 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4959 if (MaxVF < MinVF) {
4960 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4961 << ") with target's minimum: " << MinVF << '\n');
4962 MaxVF = MinVF;
4966 return MaxVF;
4969 VectorizationFactor
4970 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4971 float Cost = expectedCost(1).first;
4972 const float ScalarCost = Cost;
4973 unsigned Width = 1;
4974 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4976 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4977 if (ForceVectorization && MaxVF > 1) {
4978 // Ignore scalar width, because the user explicitly wants vectorization.
4979 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4980 // evaluation.
4981 Cost = std::numeric_limits<float>::max();
4984 for (unsigned i = 2; i <= MaxVF; i *= 2) {
4985 // Notice that the vector loop needs to be executed fewer times, so
4986 // we need to divide the cost of the vector loop by the width of
4987 // the vector elements.
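// For example (illustrative costs), if the scalar loop costs 8 and the VF = 4
// loop costs 20, the per-lane cost is 20 / 4 = 5, which beats the scalar cost
// and makes VF = 4 the best width seen so far.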
4988 VectorizationCostTy C = expectedCost(i);
4989 float VectorCost = C.first / (float)i;
4990 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4991 << " costs: " << (int)VectorCost << ".\n");
4992 if (!C.second && !ForceVectorization) {
4993 LLVM_DEBUG(
4994 dbgs() << "LV: Not considering vector loop of width " << i
4995 << " because it will not generate any vector instructions.\n");
4996 continue;
4998 if (VectorCost < Cost) {
4999 Cost = VectorCost;
5000 Width = i;
5004 if (!EnableCondStoresVectorization && NumPredStores) {
5005 reportVectorizationFailure("There are conditional stores.",
5006 "store that is conditionally executed prevents vectorization",
5007 "ConditionalStore", ORE, TheLoop);
5008 Width = 1;
5009 Cost = ScalarCost;
5012 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5013 << "LV: Vectorization seems to be not beneficial, "
5014 << "but was forced by a user.\n");
5015 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5016 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5017 return Factor;
5020 std::pair<unsigned, unsigned>
5021 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5022 unsigned MinWidth = -1U;
5023 unsigned MaxWidth = 8;
5024 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5026 // For each block.
5027 for (BasicBlock *BB : TheLoop->blocks()) {
5028 // For each instruction in the loop.
5029 for (Instruction &I : BB->instructionsWithoutDebug()) {
5030 Type *T = I.getType();
5032 // Skip ignored values.
5033 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5034 continue;
5036 // Only examine Loads, Stores and PHINodes.
5037 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5038 continue;
5040 // Examine PHI nodes that are reduction variables. Update the type to
5041 // account for the recurrence type.
5042 if (auto *PN = dyn_cast<PHINode>(&I)) {
5043 if (!Legal->isReductionVariable(PN))
5044 continue;
5045 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5046 T = RdxDesc.getRecurrenceType();
5049 // Examine the stored values.
5050 if (auto *ST = dyn_cast<StoreInst>(&I))
5051 T = ST->getValueOperand()->getType();
5053 // Ignore loaded pointer types and stored pointer types that are not
5054 // vectorizable.
5056 // FIXME: The check here attempts to predict whether a load or store will
5057 // be vectorized. We only know this for certain after a VF has
5058 // been selected. Here, we assume that if an access can be
5059 // vectorized, it will be. We should also look at extending this
5060 // optimization to non-pointer types.
5062 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5063 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5064 continue;
5066 MinWidth = std::min(MinWidth,
5067 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5068 MaxWidth = std::max(MaxWidth,
5069 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5073 return {MinWidth, MaxWidth};
5076 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5077 unsigned LoopCost) {
5078 // -- The interleave heuristics --
5079 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5080 // There are many micro-architectural considerations that we can't predict
5081 // at this level. For example, frontend pressure (on decode or fetch) due to
5082 // code size, or the number and capabilities of the execution ports.
5084 // We use the following heuristics to select the interleave count:
5085 // 1. If the code has reductions, then we interleave to break the cross
5086 // iteration dependency.
5087 // 2. If the loop is really small, then we interleave to reduce the loop
5088 // overhead.
5089 // 3. We don't interleave if we think that we will spill registers to memory
5090 // due to the increased register pressure.
5092 if (!isScalarEpilogueAllowed())
5093 return 1;
5095 // We already used the maximum safe dependence distance to limit the VF; do not interleave further.
5096 if (Legal->getMaxSafeDepDistBytes() != -1U)
5097 return 1;
5099 // Do not interleave loops with a relatively small trip count.
5100 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5101 if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5102 return 1;
5104 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5105 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5106 << " registers\n");
5108 if (VF == 1) {
5109 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5110 TargetNumRegisters = ForceTargetNumScalarRegs;
5111 } else {
5112 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5113 TargetNumRegisters = ForceTargetNumVectorRegs;
5116 RegisterUsage R = calculateRegisterUsage({VF})[0];
5117 // We divide by these values below, so assume that we have at least one
5118 // instruction that uses at least one register.
5119 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5121 // We calculate the interleave count using the following formula.
5122 // Subtract the number of loop invariants from the number of available
5123 // registers. These registers are used by all of the interleaved instances.
5124 // Next, divide the remaining registers by the number of registers that is
5125 // required by the loop, in order to estimate how many parallel instances
5126 // fit without causing spills. All of this is rounded down if necessary to be
5127 // a power of two. We want power of two interleave count to simplify any
5128 // addressing operations or alignment considerations.
5129 // We also want power of two interleave counts to ensure that the induction
5130 // variable of the vector loop wraps to zero, when tail is folded by masking;
5131 // this currently happens when OptForSize, in which case IC is set to 1 above.
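// For example (illustrative numbers), with 16 available registers, 2 of them
// tied up by loop-invariant values, and at most 3 registers live inside the
// loop, IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.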
5132 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5133 R.MaxLocalUsers);
5135 // Don't count the induction variable as interleaved.
5136 if (EnableIndVarRegisterHeur)
5137 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5138 std::max(1U, (R.MaxLocalUsers - 1)));
5140 // Clamp the interleave ranges to reasonable counts.
5141 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5143 // Check if the user has overridden the max.
5144 if (VF == 1) {
5145 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5146 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5147 } else {
5148 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5149 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5152 // If the trip count is constant, limit the interleave count to be less than
5153 // the trip count divided by VF.
5154 if (TC > 0) {
5155 assert(TC >= VF && "VF exceeds trip count?");
5156 if ((TC / VF) < MaxInterleaveCount)
5157 MaxInterleaveCount = (TC / VF);
5160 // If we did not calculate the cost for VF (because the user selected the VF)
5161 // then we calculate the cost of VF here.
5162 if (LoopCost == 0)
5163 LoopCost = expectedCost(VF).first;
5165 assert(LoopCost && "Non-zero loop cost expected");
5167 // Clamp the calculated IC to be between 1 and the max interleave count
5168 // that the target and trip count allow.
5169 if (IC > MaxInterleaveCount)
5170 IC = MaxInterleaveCount;
5171 else if (IC < 1)
5172 IC = 1;
5174 // Interleave if we vectorized this loop and there is a reduction that could
5175 // benefit from interleaving.
5176 if (VF > 1 && !Legal->getReductionVars()->empty()) {
5177 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5178 return IC;
5181 // Note that if we've already vectorized the loop we will have done the
5182 // runtime check and so interleaving won't require further checks.
5183 bool InterleavingRequiresRuntimePointerCheck =
5184 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5186 // We want to interleave small loops in order to reduce the loop overhead and
5187 // potentially expose ILP opportunities.
5188 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5189 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5190 // We assume that the cost overhead is 1 and we use the cost model
5191 // to estimate the cost of the loop and interleave until the cost of the
5192 // loop overhead is about 5% of the cost of the loop.
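// For example (illustrative costs), if SmallLoopCost were 20 and LoopCost
// were 6, SmallIC would be clamped to min(IC, PowerOf2Floor(20 / 6)) =
// min(IC, 2).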
5193 unsigned SmallIC =
5194 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5196 // Interleave until store/load ports (estimated by max interleave count) are
5197 // saturated.
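// For example (illustrative counts), with IC = 8, two stores and one load in
// the loop body, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8.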
5198 unsigned NumStores = Legal->getNumStores();
5199 unsigned NumLoads = Legal->getNumLoads();
5200 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5201 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5203 // If we have a scalar reduction (vector reductions are already dealt with
5204 // by this point), we can increase the critical path length if the loop
5205 // we're interleaving is inside another loop. Limit it, by default, to 2, so the
5206 // critical path only gets increased by one reduction operation.
5207 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5208 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5209 SmallIC = std::min(SmallIC, F);
5210 StoresIC = std::min(StoresIC, F);
5211 LoadsIC = std::min(LoadsIC, F);
5214 if (EnableLoadStoreRuntimeInterleave &&
5215 std::max(StoresIC, LoadsIC) > SmallIC) {
5216 LLVM_DEBUG(
5217 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5218 return std::max(StoresIC, LoadsIC);
5221 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5222 return SmallIC;
5225 // Interleave if this is a large loop (small loops are already dealt with by
5226 // this point) that could benefit from interleaving.
5227 bool HasReductions = !Legal->getReductionVars()->empty();
5228 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5229 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5230 return IC;
5233 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5234 return 1;
5237 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5238 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5239 // This function calculates the register usage by measuring the highest number
5240 // of values that are alive at a single location. Obviously, this is a very
5241 // rough estimation. We scan the loop in topological order and
5242 // assign a number to each instruction. We use RPO to ensure that defs are
5243 // met before their users. We assume that each instruction that has in-loop
5244 // users starts an interval. We record every time that an in-loop value is
5245 // used, so we have a list of the first and last occurrences of each
5246 // instruction. Next, we transpose this data structure into a multi map that
5247 // holds the list of intervals that *end* at a specific location. This multi
5248 // map allows us to perform a linear search. We scan the instructions linearly
5249 // and record each time that a new interval starts, by placing it in a set.
5250 // If we find this value in the multi-map then we remove it from the set.
5251 // The max register usage is the maximum size of the set.
5252 // We also search for instructions that are defined outside the loop, but are
5253 // used inside the loop. We need this number separately from the max-interval
5254 // usage number because when we unroll, loop-invariant values do not take
5255 // more registers.
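// For example, a value defined early in the loop body but only used by the
// last instruction keeps its interval open across the whole body, so it
// contributes to the register count at every intermediate point.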
5256 LoopBlocksDFS DFS(TheLoop);
5257 DFS.perform(LI);
5259 RegisterUsage RU;
5261 // Each 'key' in the map opens a new interval. The values
5262 // of the map are the index of the 'last seen' usage of the
5263 // instruction that is the key.
5264 using IntervalMap = DenseMap<Instruction *, unsigned>;
5266 // Maps instruction to its index.
5267 SmallVector<Instruction *, 64> IdxToInstr;
5268 // Marks the end of each interval.
5269 IntervalMap EndPoint;
5271 // Saves the set of instructions that are used in the loop.
5271 SmallPtrSet<Instruction *, 8> Ends;
5272 // Saves the list of values that are used in the loop but are
5273 // defined outside the loop, such as arguments and constants.
5274 SmallPtrSet<Value *, 8> LoopInvariants;
5276 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5277 for (Instruction &I : BB->instructionsWithoutDebug()) {
5278 IdxToInstr.push_back(&I);
5280 // Save the end location of each USE.
5281 for (Value *U : I.operands()) {
5282 auto *Instr = dyn_cast<Instruction>(U);
5284 // Ignore non-instruction values such as arguments, constants, etc.
5285 if (!Instr)
5286 continue;
5288 // If this instruction is outside the loop then record it and continue.
5289 if (!TheLoop->contains(Instr)) {
5290 LoopInvariants.insert(Instr);
5291 continue;
5294 // Overwrite previous end points.
5295 EndPoint[Instr] = IdxToInstr.size();
5296 Ends.insert(Instr);
5301 // Saves the list of intervals that end with the index in 'key'.
5302 using InstrList = SmallVector<Instruction *, 2>;
5303 DenseMap<unsigned, InstrList> TransposeEnds;
5305 // Transpose the EndPoints to a list of values that end at each index.
5306 for (auto &Interval : EndPoint)
5307 TransposeEnds[Interval.second].push_back(Interval.first);
5309 SmallPtrSet<Instruction *, 8> OpenIntervals;
5311 // Get the size of the widest register.
5312 unsigned MaxSafeDepDist = -1U;
5313 if (Legal->getMaxSafeDepDistBytes() != -1U)
5314 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5315 unsigned WidestRegister =
5316 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5317 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5319 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5320 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5322 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5324 // A lambda that gets the register usage for the given type and VF.
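// For example (illustrative values), with VF = 8, a 32-bit element type and a
// 128-bit widest register, a value occupies max(1, 8 * 32 / 128) = 2 vector
// registers.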
5325 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5326 if (Ty->isTokenTy())
5327 return 0U;
5328 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5329 return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5332 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5333 Instruction *I = IdxToInstr[i];
5335 // Remove all of the instructions that end at this location.
5336 InstrList &List = TransposeEnds[i];
5337 for (Instruction *ToRemove : List)
5338 OpenIntervals.erase(ToRemove);
5340 // Ignore instructions that are never used within the loop.
5341 if (Ends.find(I) == Ends.end())
5342 continue;
5344 // Skip ignored values.
5345 if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5346 continue;
5348 // For each VF find the maximum usage of registers.
5349 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5350 if (VFs[j] == 1) {
5351 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5352 continue;
5354 collectUniformsAndScalars(VFs[j]);
5355 // Count the number of live intervals.
5356 unsigned RegUsage = 0;
5357 for (auto Inst : OpenIntervals) {
5358 // Skip ignored values for VF > 1.
5359 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5360 isScalarAfterVectorization(Inst, VFs[j]))
5361 continue;
5362 RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5364 MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5367 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5368 << OpenIntervals.size() << '\n');
5370 // Add the current instruction to the list of open intervals.
5371 OpenIntervals.insert(I);
5374 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5375 unsigned Invariant = 0;
5376 if (VFs[i] == 1)
5377 Invariant = LoopInvariants.size();
5378 else {
5379 for (auto Inst : LoopInvariants)
5380 Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5383 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5384 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5385 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5386 << '\n');
5388 RU.LoopInvariantRegs = Invariant;
5389 RU.MaxLocalUsers = MaxUsages[i];
5390 RUs[i] = RU;
5393 return RUs;
5396 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5397 // TODO: Cost model for emulated masked load/store is completely
5398 // broken. This hack guides the cost model to use an artificially
5399 // high enough value to practically disable vectorization with such
5400 // operations, except where the previously deployed legality hack allowed
5401 // using very low cost values. This is to avoid regressions coming simply
5402 // from moving the "masked load/store" check from legality to the cost model.
5403 // Masked Load/Gather emulation was previously never allowed.
5404 // Only a limited amount of Masked Store/Scatter emulation was allowed.
5405 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5406 return isa<LoadInst>(I) ||
5407 (isa<StoreInst>(I) &&
5408 NumPredStores > NumberOfStoresToPredicate);
5411 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5412 // If we aren't vectorizing the loop, or if we've already collected the
5413 // instructions to scalarize, there's nothing to do. Collection may already
5414 // have occurred if we have a user-selected VF and are now computing the
5415 // expected cost for interleaving.
5416 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5417 return;
5419 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5420 // not profitable to scalarize any instructions, the presence of VF in the
5421 // map will indicate that we've analyzed it already.
5422 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5424 // Find all the instructions that are scalar with predication in the loop and
5425 // determine if it would be better to not if-convert the blocks they are in.
5426 // If so, we also record the instructions to scalarize.
5427 for (BasicBlock *BB : TheLoop->blocks()) {
5428 if (!blockNeedsPredication(BB))
5429 continue;
5430 for (Instruction &I : *BB)
5431 if (isScalarWithPredication(&I)) {
5432 ScalarCostsTy ScalarCosts;
5433 // Do not apply discount logic if hacked cost is needed
5434 // for emulated masked memrefs.
5435 if (!useEmulatedMaskMemRefHack(&I) &&
5436 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5437 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5438 // Remember that BB will remain after vectorization.
5439 PredicatedBBsAfterVectorization.insert(BB);
5444 int LoopVectorizationCostModel::computePredInstDiscount(
5445 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5446 unsigned VF) {
5447 assert(!isUniformAfterVectorization(PredInst, VF) &&
5448 "Instruction marked uniform-after-vectorization will be predicated");
5450 // Initialize the discount to zero, meaning that the scalar version and the
5451 // vector version cost the same.
5452 int Discount = 0;
5454 // Holds instructions to analyze. The instructions we visit are mapped in
5455 // ScalarCosts. Those instructions are the ones that would be scalarized if
5456 // we find that the scalar version costs less.
5457 SmallVector<Instruction *, 8> Worklist;
5459 // Returns true if the given instruction can be scalarized.
5460 auto canBeScalarized = [&](Instruction *I) -> bool {
5461 // We only attempt to scalarize instructions forming a single-use chain
5462 // from the original predicated block that would otherwise be vectorized.
5463 // Although not strictly necessary, we give up on instructions we know will
5464 // already be scalar to avoid traversing chains that are unlikely to be
5465 // beneficial.
5466 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5467 isScalarAfterVectorization(I, VF))
5468 return false;
5470 // If the instruction is scalar with predication, it will be analyzed
5471 // separately. We ignore it within the context of PredInst.
5472 if (isScalarWithPredication(I))
5473 return false;
5475 // If any of the instruction's operands are uniform after vectorization,
5476 // the instruction cannot be scalarized. This prevents, for example, a
5477 // masked load from being scalarized.
5479 // We assume we will only emit a value for lane zero of an instruction
5480 // marked uniform after vectorization, rather than VF identical values.
5481 // Thus, if we scalarize an instruction that uses a uniform, we would
5482 // create uses of values corresponding to the lanes we aren't emitting code
5483 // for. This behavior can be changed by allowing getScalarValue to clone
5484 // the lane zero values for uniforms rather than asserting.
5485 for (Use &U : I->operands())
5486 if (auto *J = dyn_cast<Instruction>(U.get()))
5487 if (isUniformAfterVectorization(J, VF))
5488 return false;
5490 // Otherwise, we can scalarize the instruction.
5491 return true;
5494 // Compute the expected cost discount from scalarizing the entire expression
5495 // feeding the predicated instruction. We currently only consider expressions
5496 // that are single-use instruction chains.
5497 Worklist.push_back(PredInst);
5498 while (!Worklist.empty()) {
5499 Instruction *I = Worklist.pop_back_val();
5501 // If we've already analyzed the instruction, there's nothing to do.
5502 if (ScalarCosts.find(I) != ScalarCosts.end())
5503 continue;
5505 // Compute the cost of the vector instruction. Note that this cost already
5506 // includes the scalarization overhead of the predicated instruction.
5507 unsigned VectorCost = getInstructionCost(I, VF).first;
5509 // Compute the cost of the scalarized instruction. This cost is the cost of
5510 // the instruction as if it wasn't if-converted and instead remained in the
5511 // predicated block. We will scale this cost by block probability after
5512 // computing the scalarization overhead.
5513 unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5515 // Compute the scalarization overhead of needed insertelement instructions
5516 // and phi nodes.
5517 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5518 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5519 true, false);
5520 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5523 // Compute the scalarization overhead of needed extractelement
5524 // instructions. For each of the instruction's operands, if the operand can
5525 // be scalarized, add it to the worklist; otherwise, account for the
5526 // overhead.
5527 for (Use &U : I->operands())
5528 if (auto *J = dyn_cast<Instruction>(U.get())) {
5529 assert(VectorType::isValidElementType(J->getType()) &&
5530 "Instruction has non-scalar type");
5531 if (canBeScalarized(J))
5532 Worklist.push_back(J);
5533 else if (needsExtract(J, VF))
5534 ScalarCost += TTI.getScalarizationOverhead(
5535 ToVectorTy(J->getType(), VF), false, true);
5538 // Scale the total scalar cost by block probability.
5539 ScalarCost /= getReciprocalPredBlockProb();
5541 // Compute the discount. A non-negative discount means the vector version
5542 // of the instruction costs more, and scalarizing would be beneficial.
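 // Illustrative numbers (assumed, not taken from any target): with VF = 4, a
 // predicated udiv whose vector cost is 14 and whose per-lane scalar cost is 2
 // gives ScalarCost = 4 * 2 = 8 before insert/extract overhead; assuming
 // getReciprocalPredBlockProb() returns 2 (a 50% execution probability), that
 // is scaled to 4, so the discount grows by 14 - 4 = 10 and scalarizing this
 // chain looks worthwhile.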
5543 Discount += VectorCost - ScalarCost;
5544 ScalarCosts[I] = ScalarCost;
5547 return Discount;
5550 LoopVectorizationCostModel::VectorizationCostTy
5551 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5552 VectorizationCostTy Cost;
5554 // For each block.
5555 for (BasicBlock *BB : TheLoop->blocks()) {
5556 VectorizationCostTy BlockCost;
5558 // For each instruction in the old loop.
5559 for (Instruction &I : BB->instructionsWithoutDebug()) {
5560 // Skip ignored values.
5561 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5562 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5563 continue;
5565 VectorizationCostTy C = getInstructionCost(&I, VF);
5567 // Check if we should override the cost.
5568 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5569 C.first = ForceTargetInstructionCost;
5571 BlockCost.first += C.first;
5572 BlockCost.second |= C.second;
5573 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5574 << " for VF " << VF << " For instruction: " << I
5575 << '\n');
5578 // If we are vectorizing a predicated block, it will have been
5579 // if-converted. This means that the block's instructions (aside from
5580 // stores and instructions that may divide by zero) will now be
5581 // unconditionally executed. For the scalar case, we may not always execute
5582 // the predicated block. Thus, scale the block's cost by the probability of
5583 // executing it.
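 // For example, if the predicated block's scalar instructions sum to a cost of
 // 10 and getReciprocalPredBlockProb() is assumed to return 2 (i.e. the block
 // is expected to execute half the time), it contributes 5 to the VF == 1 cost.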
5584 if (VF == 1 && blockNeedsPredication(BB))
5585 BlockCost.first /= getReciprocalPredBlockProb();
5587 Cost.first += BlockCost.first;
5588 Cost.second |= BlockCost.second;
5591 return Cost;
5594 /// Gets Address Access SCEV after verifying that the access pattern
5595 /// is loop invariant except for the induction variable dependence.
5597 /// This SCEV can be sent to the Target in order to estimate the address
5598 /// calculation cost.
5599 static const SCEV *getAddressAccessSCEV(
5600 Value *Ptr,
5601 LoopVectorizationLegality *Legal,
5602 PredicatedScalarEvolution &PSE,
5603 const Loop *TheLoop) {
5605 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5606 if (!Gep)
5607 return nullptr;
5609 // We are looking for a gep with all loop invariant indices except for one
5610 // which should be an induction variable.
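 // For example (hypothetical IR), the following access qualifies because %inv
 // is loop invariant and %ind is an induction variable:
 //   %gep = getelementptr inbounds [64 x i32], [64 x i32]* %A, i64 %inv, i64 %ind
 // whereas a GEP whose variant index is not an induction (e.g. a value loaded
 // inside the loop) makes us return nullptr below.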
5611 auto SE = PSE.getSE();
5612 unsigned NumOperands = Gep->getNumOperands();
5613 for (unsigned i = 1; i < NumOperands; ++i) {
5614 Value *Opd = Gep->getOperand(i);
5615 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5616 !Legal->isInductionVariable(Opd))
5617 return nullptr;
5620 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5621 return PSE.getSCEV(Ptr);
5624 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5625 return Legal->hasStride(I->getOperand(0)) ||
5626 Legal->hasStride(I->getOperand(1));
5629 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5630 unsigned VF) {
5631 assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5632 Type *ValTy = getMemInstValueType(I);
5633 auto SE = PSE.getSE();
5635 unsigned Alignment = getLoadStoreAlignment(I);
5636 unsigned AS = getLoadStoreAddressSpace(I);
5637 Value *Ptr = getLoadStorePointerOperand(I);
5638 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5640 // Figure out whether the access is strided and get the stride value
5641 // if it's known at compile time.
5642 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5644 // Get the cost of the scalar memory instruction and address computation.
5645 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5647 // Don't pass *I here, since it is scalar but will actually be part of a
5648 // vectorized loop where its user is a vectorized instruction.
5649 Cost += VF *
5650 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5651 AS);
5653 // Get the overhead of the extractelement and insertelement instructions
5654 // we might create due to scalarization.
5655 Cost += getScalarizationOverhead(I, VF);
5657 // If we have a predicated store, it may not be executed for each vector
5658 // lane. Scale the cost by the probability of executing the predicated
5659 // block.
5660 if (isPredicatedInst(I)) {
5661 Cost /= getReciprocalPredBlockProb();
5663 if (useEmulatedMaskMemRefHack(I))
5664 // Artificially setting to a high enough value to practically disable
5665 // vectorization with such operations.
5666 Cost = 3000000;
5669 return Cost;
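 // The computation above, with assumed unit costs for illustration: at VF = 4,
 // address computation and the scalar load each cost 1, giving a base cost of
 // 4 + 4 = 8 plus insert/extract overhead. If the load is predicated, the cost
 // is first scaled by the block probability, and because
 // useEmulatedMaskMemRefHack() returns true for loads, it is then pinned to
 // 3000000 so that such emulation is effectively never chosen.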
5672 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5673 unsigned VF) {
5674 Type *ValTy = getMemInstValueType(I);
5675 Type *VectorTy = ToVectorTy(ValTy, VF);
5676 unsigned Alignment = getLoadStoreAlignment(I);
5677 Value *Ptr = getLoadStorePointerOperand(I);
5678 unsigned AS = getLoadStoreAddressSpace(I);
5679 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5681 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5682 "Stride should be 1 or -1 for consecutive memory access");
5683 unsigned Cost = 0;
5684 if (Legal->isMaskRequired(I))
5685 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5686 else
5687 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5689 bool Reverse = ConsecutiveStride < 0;
5690 if (Reverse)
5691 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5692 return Cost;
5695 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5696 unsigned VF) {
5697 Type *ValTy = getMemInstValueType(I);
5698 Type *VectorTy = ToVectorTy(ValTy, VF);
5699 unsigned Alignment = getLoadStoreAlignment(I);
5700 unsigned AS = getLoadStoreAddressSpace(I);
5701 if (isa<LoadInst>(I)) {
5702 return TTI.getAddressComputationCost(ValTy) +
5703 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5704 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5706 StoreInst *SI = cast<StoreInst>(I);
5708 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5709 return TTI.getAddressComputationCost(ValTy) +
5710 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5711 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5712 Instruction::ExtractElement,
5713 VectorTy, VF - 1));
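 // For illustration, the uniform load above is priced as: address computation
 // + one scalar load + a broadcast shuffle to splat the loaded value. The
 // uniform store is priced as: address computation + one scalar store, plus an
 // extract of the last lane (VF - 1) unless the stored value is loop
 // invariant, in which case no extract is needed.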
5716 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5717 unsigned VF) {
5718 Type *ValTy = getMemInstValueType(I);
5719 Type *VectorTy = ToVectorTy(ValTy, VF);
5720 unsigned Alignment = getLoadStoreAlignment(I);
5721 Value *Ptr = getLoadStorePointerOperand(I);
5723 return TTI.getAddressComputationCost(VectorTy) +
5724 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5725 Legal->isMaskRequired(I), Alignment);
5728 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5729 unsigned VF) {
5730 Type *ValTy = getMemInstValueType(I);
5731 Type *VectorTy = ToVectorTy(ValTy, VF);
5732 unsigned AS = getLoadStoreAddressSpace(I);
5734 auto Group = getInterleavedAccessGroup(I);
5735 assert(Group && "Failed to get an interleaved access group.");
5737 unsigned InterleaveFactor = Group->getFactor();
5738 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5740 // Holds the indices of existing members in an interleaved load group.
5741 // An interleaved store group doesn't need this as it doesn't allow gaps.
5742 SmallVector<unsigned, 4> Indices;
5743 if (isa<LoadInst>(I)) {
5744 for (unsigned i = 0; i < InterleaveFactor; i++)
5745 if (Group->getMember(i))
5746 Indices.push_back(i);
5749 // Calculate the cost of the whole interleaved group.
5750 bool UseMaskForGaps =
5751 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5752 unsigned Cost = TTI.getInterleavedMemoryOpCost(
5753 I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5754 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5756 if (Group->isReverse()) {
5757 // TODO: Add support for reversed masked interleaved access.
5758 assert(!Legal->isMaskRequired(I) &&
5759 "Reverse masked interleaved access not supported.");
5760 Cost += Group->getNumMembers() *
5761 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5763 return Cost;
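 // For example, a reversed load group with factor 2 and both members present
 // gets Indices = {0, 1}, a wide vector of VF * 2 elements for the interleaved
 // memory cost query, and, being reversed, one extra SK_Reverse shuffle per
 // member on top of that.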
5766 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5767 unsigned VF) {
5768 // Calculate the scalar cost only. The vectorization cost should already
5769 // be available at this point.
5770 if (VF == 1) {
5771 Type *ValTy = getMemInstValueType(I);
5772 unsigned Alignment = getLoadStoreAlignment(I);
5773 unsigned AS = getLoadStoreAddressSpace(I);
5775 return TTI.getAddressComputationCost(ValTy) +
5776 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5778 return getWideningCost(I, VF);
5781 LoopVectorizationCostModel::VectorizationCostTy
5782 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5783 // If we know that this instruction will remain uniform, check the cost of
5784 // the scalar version.
5785 if (isUniformAfterVectorization(I, VF))
5786 VF = 1;
5788 if (VF > 1 && isProfitableToScalarize(I, VF))
5789 return VectorizationCostTy(InstsToScalarize[VF][I], false);
5791 // Forced scalars do not have any scalarization overhead.
5792 auto ForcedScalar = ForcedScalars.find(VF);
5793 if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5794 auto InstSet = ForcedScalar->second;
5795 if (InstSet.find(I) != InstSet.end())
5796 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5799 Type *VectorTy;
5800 unsigned C = getInstructionCost(I, VF, VectorTy);
5802 bool TypeNotScalarized =
5803 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5804 return VectorizationCostTy(C, TypeNotScalarized);
5807 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5808 unsigned VF) {
5810 if (VF == 1)
5811 return 0;
5813 unsigned Cost = 0;
5814 Type *RetTy = ToVectorTy(I->getType(), VF);
5815 if (!RetTy->isVoidTy() &&
5816 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5817 Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5819 // Some targets keep addresses scalar.
5820 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5821 return Cost;
5823 // Some targets support efficient element stores.
5824 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5825 return Cost;
5827 // Collect operands to consider.
5828 CallInst *CI = dyn_cast<CallInst>(I);
5829 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5831 // Skip operands that do not require extraction/scalarization and do not incur
5832 // any overhead.
5833 return Cost + TTI.getOperandsScalarizationOverhead(
5834 filterExtractingOperands(Ops, VF), VF);
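 // As a rough example with VF = 4: a non-void instruction pays for the four
 // insertelements that rebuild its result vector, and each operand that must
 // be extracted from a vector (as selected by filterExtractingOperands) pays
 // for four extractelements, unless one of the early exits above for scalar
 // addresses or efficient element load/store applies.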
5837 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5838 if (VF == 1)
5839 return;
5840 NumPredStores = 0;
5841 for (BasicBlock *BB : TheLoop->blocks()) {
5842 // For each instruction in the old loop.
5843 for (Instruction &I : *BB) {
5844 Value *Ptr = getLoadStorePointerOperand(&I);
5845 if (!Ptr)
5846 continue;
5848 // TODO: We should generate better code and update the cost model for
5849 // predicated uniform stores. Today they are treated as any other
5850 // predicated store (see added test cases in
5851 // invariant-store-vectorization.ll).
5852 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5853 NumPredStores++;
5855 if (Legal->isUniform(Ptr) &&
5856 // Conditional loads and stores should be scalarized and predicated.
5857 // isScalarWithPredication cannot be used here since masked
5858 // gather/scatters are not considered scalar with predication.
5859 !Legal->blockNeedsPredication(I.getParent())) {
5860 // TODO: Avoid replicating loads and stores instead of
5861 // relying on instcombine to remove them.
5862 // Load: Scalar load + broadcast
5863 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5864 unsigned Cost = getUniformMemOpCost(&I, VF);
5865 setWideningDecision(&I, VF, CM_Scalarize, Cost);
5866 continue;
5869 // We assume that widening is the best solution when possible.
5870 if (memoryInstructionCanBeWidened(&I, VF)) {
5871 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5872 int ConsecutiveStride =
5873 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5874 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5875 "Expected consecutive stride.");
5876 InstWidening Decision =
5877 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5878 setWideningDecision(&I, VF, Decision, Cost);
5879 continue;
5882 // Choose between Interleaving, Gather/Scatter or Scalarization.
5883 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5884 unsigned NumAccesses = 1;
5885 if (isAccessInterleaved(&I)) {
5886 auto Group = getInterleavedAccessGroup(&I);
5887 assert(Group && "Failed to get an interleaved access group.");
5889 // Make one decision for the whole group.
5890 if (getWideningDecision(&I, VF) != CM_Unknown)
5891 continue;
5893 NumAccesses = Group->getNumMembers();
5894 if (interleavedAccessCanBeWidened(&I, VF))
5895 InterleaveCost = getInterleaveGroupCost(&I, VF);
5898 unsigned GatherScatterCost =
5899 isLegalGatherOrScatter(&I)
5900 ? getGatherScatterCost(&I, VF) * NumAccesses
5901 : std::numeric_limits<unsigned>::max();
5903 unsigned ScalarizationCost =
5904 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5906 // Choose better solution for the current VF,
5907 // write down this decision and use it during vectorization.
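 // Illustrative choice with assumed costs: InterleaveCost = 8, no legal
 // gather/scatter (so GatherScatterCost stays at its max() sentinel) and
 // ScalarizationCost = 12 selects CM_Interleave with cost 8. Note that
 // interleaving wins ties against gather/scatter (<=) but must be strictly
 // cheaper than scalarization (<).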
5908 unsigned Cost;
5909 InstWidening Decision;
5910 if (InterleaveCost <= GatherScatterCost &&
5911 InterleaveCost < ScalarizationCost) {
5912 Decision = CM_Interleave;
5913 Cost = InterleaveCost;
5914 } else if (GatherScatterCost < ScalarizationCost) {
5915 Decision = CM_GatherScatter;
5916 Cost = GatherScatterCost;
5917 } else {
5918 Decision = CM_Scalarize;
5919 Cost = ScalarizationCost;
5921 // If the instruction belongs to an interleave group, the whole group
5922 // receives the same decision. The whole group receives the cost, but
5923 // the cost will actually be assigned to one instruction.
5924 if (auto Group = getInterleavedAccessGroup(&I))
5925 setWideningDecision(Group, VF, Decision, Cost);
5926 else
5927 setWideningDecision(&I, VF, Decision, Cost);
5931 // Make sure that any load of an address and any other address computation
5932 // remains scalar unless there is gather/scatter support. This avoids
5933 // inevitable extracts into address registers, and also has the benefit of
5934 // activating LSR more, since that pass can't optimize vectorized
5935 // addresses.
5936 if (TTI.prefersVectorizedAddressing())
5937 return;
5939 // Start with all scalar pointer uses.
5940 SmallPtrSet<Instruction *, 8> AddrDefs;
5941 for (BasicBlock *BB : TheLoop->blocks())
5942 for (Instruction &I : *BB) {
5943 Instruction *PtrDef =
5944 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5945 if (PtrDef && TheLoop->contains(PtrDef) &&
5946 getWideningDecision(&I, VF) != CM_GatherScatter)
5947 AddrDefs.insert(PtrDef);
5950 // Add all instructions used to generate the addresses.
5951 SmallVector<Instruction *, 4> Worklist;
5952 for (auto *I : AddrDefs)
5953 Worklist.push_back(I);
5954 while (!Worklist.empty()) {
5955 Instruction *I = Worklist.pop_back_val();
5956 for (auto &Op : I->operands())
5957 if (auto *InstOp = dyn_cast<Instruction>(Op))
5958 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5959 AddrDefs.insert(InstOp).second)
5960 Worklist.push_back(InstOp);
5963 for (auto *I : AddrDefs) {
5964 if (isa<LoadInst>(I)) {
5965 // Setting the desired widening decision should ideally be handled by
5966 // cost functions, but since this involves the task of finding out
5967 // if the loaded register is involved in an address computation, it is
5968 // instead changed here when we know this is the case.
5969 InstWidening Decision = getWideningDecision(I, VF);
5970 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5971 // Scalarize a widened load of address.
5972 setWideningDecision(I, VF, CM_Scalarize,
5973 (VF * getMemoryInstructionCost(I, 1)));
5974 else if (auto Group = getInterleavedAccessGroup(I)) {
5975 // Scalarize an interleave group of address loads.
5976 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5977 if (Instruction *Member = Group->getMember(I))
5978 setWideningDecision(Member, VF, CM_Scalarize,
5979 (VF * getMemoryInstructionCost(Member, 1)));
5982 } else
5983 // Make sure I gets scalarized and is given a cost estimate without
5984 // scalarization overhead.
5985 ForcedScalars[VF].insert(I);
5989 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5990 unsigned VF,
5991 Type *&VectorTy) {
5992 Type *RetTy = I->getType();
5993 if (canTruncateToMinimalBitwidth(I, VF))
5994 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5995 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5996 auto SE = PSE.getSE();
5998 // TODO: We need to estimate the cost of intrinsic calls.
5999 switch (I->getOpcode()) {
6000 case Instruction::GetElementPtr:
6001 // We mark this instruction as zero-cost because the cost of GEPs in
6002 // vectorized code depends on whether the corresponding memory instruction
6003 // is scalarized or not. Therefore, we handle GEPs with the memory
6004 // instruction cost.
6005 return 0;
6006 case Instruction::Br: {
6007 // In cases of scalarized and predicated instructions, there will be VF
6008 // predicated blocks in the vectorized loop. Each branch around these
6009 // blocks also requires an extract of its vector compare i1 element.
6010 bool ScalarPredicatedBB = false;
6011 BranchInst *BI = cast<BranchInst>(I);
6012 if (VF > 1 && BI->isConditional() &&
6013 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6014 PredicatedBBsAfterVectorization.end() ||
6015 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6016 PredicatedBBsAfterVectorization.end()))
6017 ScalarPredicatedBB = true;
6019 if (ScalarPredicatedBB) {
6020 // Return cost for branches around scalarized and predicated blocks.
6021 Type *Vec_i1Ty =
6022 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6023 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6024 (TTI.getCFInstrCost(Instruction::Br) * VF));
6025 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6026 // The back-edge branch will remain, as will all scalar branches.
6027 return TTI.getCFInstrCost(Instruction::Br);
6028 else
6029 // This branch will be eliminated by if-conversion.
6030 return 0;
6031 // Note: We currently assume zero cost for an unconditional branch inside
6032 // a predicated block since it will become a fall-through, although we
6033 // may decide in the future to call TTI for all branches.
6035 case Instruction::PHI: {
6036 auto *Phi = cast<PHINode>(I);
6038 // First-order recurrences are replaced by vector shuffles inside the loop.
6039 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6040 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6041 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6042 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6044 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6045 // converted into select instructions. We require N - 1 selects per phi
6046 // node, where N is the number of incoming values.
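 // For example, a phi in an if-converted block with three incoming values
 // lowers to two vector selects, so its cost is modelled as
 // 2 * TTI.getCmpSelInstrCost(Instruction::Select, <VF x ty>, <VF x i1>).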
6047 if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6048 return (Phi->getNumIncomingValues() - 1) *
6049 TTI.getCmpSelInstrCost(
6050 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6051 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6053 return TTI.getCFInstrCost(Instruction::PHI);
6055 case Instruction::UDiv:
6056 case Instruction::SDiv:
6057 case Instruction::URem:
6058 case Instruction::SRem:
6059 // If we have a predicated instruction, it may not be executed for each
6060 // vector lane. Get the scalarization cost and scale this amount by the
6061 // probability of executing the predicated block. If the instruction is not
6062 // predicated, we fall through to the next case.
6063 if (VF > 1 && isScalarWithPredication(I)) {
6064 unsigned Cost = 0;
6066 // These instructions have a non-void type, so account for the phi nodes
6067 // that we will create. This cost is likely to be zero. The phi node
6068 // cost, if any, should be scaled by the block probability because it
6069 // models a copy at the end of each predicated block.
6070 Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6072 // The cost of the non-predicated instruction.
6073 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6075 // The cost of insertelement and extractelement instructions needed for
6076 // scalarization.
6077 Cost += getScalarizationOverhead(I, VF);
6079 // Scale the cost by the probability of executing the predicated blocks.
6080 // This assumes the predicated block for each vector lane is equally
6081 // likely.
6082 return Cost / getReciprocalPredBlockProb();
6084 LLVM_FALLTHROUGH;
6085 case Instruction::Add:
6086 case Instruction::FAdd:
6087 case Instruction::Sub:
6088 case Instruction::FSub:
6089 case Instruction::Mul:
6090 case Instruction::FMul:
6091 case Instruction::FDiv:
6092 case Instruction::FRem:
6093 case Instruction::Shl:
6094 case Instruction::LShr:
6095 case Instruction::AShr:
6096 case Instruction::And:
6097 case Instruction::Or:
6098 case Instruction::Xor: {
6099 // Since we will replace the stride by 1, the multiplication should go away.
6100 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6101 return 0;
6102 // Certain instructions can be cheaper to vectorize if they have a constant
6103 // second vector operand. One example of this is shifts on x86.
6104 Value *Op2 = I->getOperand(1);
6105 TargetTransformInfo::OperandValueProperties Op2VP;
6106 TargetTransformInfo::OperandValueKind Op2VK =
6107 TTI.getOperandInfo(Op2, Op2VP);
6108 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6109 Op2VK = TargetTransformInfo::OK_UniformValue;
6111 SmallVector<const Value *, 4> Operands(I->operand_values());
6112 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6113 return N * TTI.getArithmeticInstrCost(
6114 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6115 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6117 case Instruction::FNeg: {
6118 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6119 return N * TTI.getArithmeticInstrCost(
6120 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6121 TargetTransformInfo::OK_AnyValue,
6122 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6123 I->getOperand(0));
6125 case Instruction::Select: {
6126 SelectInst *SI = cast<SelectInst>(I);
6127 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6128 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6129 Type *CondTy = SI->getCondition()->getType();
6130 if (!ScalarCond)
6131 CondTy = VectorType::get(CondTy, VF);
6133 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6135 case Instruction::ICmp:
6136 case Instruction::FCmp: {
6137 Type *ValTy = I->getOperand(0)->getType();
6138 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6139 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6140 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6141 VectorTy = ToVectorTy(ValTy, VF);
6142 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6144 case Instruction::Store:
6145 case Instruction::Load: {
6146 unsigned Width = VF;
6147 if (Width > 1) {
6148 InstWidening Decision = getWideningDecision(I, Width);
6149 assert(Decision != CM_Unknown &&
6150 "CM decision should be taken at this point");
6151 if (Decision == CM_Scalarize)
6152 Width = 1;
6154 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6155 return getMemoryInstructionCost(I, VF);
6157 case Instruction::ZExt:
6158 case Instruction::SExt:
6159 case Instruction::FPToUI:
6160 case Instruction::FPToSI:
6161 case Instruction::FPExt:
6162 case Instruction::PtrToInt:
6163 case Instruction::IntToPtr:
6164 case Instruction::SIToFP:
6165 case Instruction::UIToFP:
6166 case Instruction::Trunc:
6167 case Instruction::FPTrunc:
6168 case Instruction::BitCast: {
6169 // We optimize the truncation of induction variables having constant
6170 // integer steps. The cost of these truncations is the same as the scalar
6171 // operation.
6172 if (isOptimizableIVTruncate(I, VF)) {
6173 auto *Trunc = cast<TruncInst>(I);
6174 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6175 Trunc->getSrcTy(), Trunc);
6178 Type *SrcScalarTy = I->getOperand(0)->getType();
6179 Type *SrcVecTy =
6180 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6181 if (canTruncateToMinimalBitwidth(I, VF)) {
6182 // This cast is going to be shrunk. This may remove the cast or it might
6183 // turn it into a slightly different cast. For example, if MinBW == 16,
6184 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6186 // Calculate the modified src and dest types.
6187 Type *MinVecTy = VectorTy;
6188 if (I->getOpcode() == Instruction::Trunc) {
6189 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6190 VectorTy =
6191 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6192 } else if (I->getOpcode() == Instruction::ZExt ||
6193 I->getOpcode() == Instruction::SExt) {
6194 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6195 VectorTy =
6196 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6200 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6201 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6203 case Instruction::Call: {
6204 bool NeedToScalarize;
6205 CallInst *CI = cast<CallInst>(I);
6206 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6207 if (getVectorIntrinsicIDForCall(CI, TLI))
6208 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6209 return CallCost;
6211 default:
6212 // The cost of executing VF copies of the scalar instruction. This opcode
6213 // is unknown. Assume that it is the same as 'mul'.
6214 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6215 getScalarizationOverhead(I, VF);
6216 } // end of switch.
6219 char LoopVectorize::ID = 0;
6221 static const char lv_name[] = "Loop Vectorization";
6223 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6224 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6225 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6226 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6227 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6228 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6229 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6230 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6231 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6232 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6233 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6234 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6235 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6236 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6237 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6239 namespace llvm {
6241 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6243 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6244 bool VectorizeOnlyWhenForced) {
6245 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6248 } // end namespace llvm
6250 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6251 // Check if the pointer operand of a load or store instruction is
6252 // consecutive.
6253 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6254 return Legal->isConsecutivePtr(Ptr);
6255 return false;
6258 void LoopVectorizationCostModel::collectValuesToIgnore() {
6259 // Ignore ephemeral values.
6260 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6262 // Ignore type-promoting instructions we identified during reduction
6263 // detection.
6264 for (auto &Reduction : *Legal->getReductionVars()) {
6265 RecurrenceDescriptor &RedDes = Reduction.second;
6266 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6267 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6269 // Ignore type-casting instructions we identified during induction
6270 // detection.
6271 for (auto &Induction : *Legal->getInductionVars()) {
6272 InductionDescriptor &IndDes = Induction.second;
6273 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6274 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6278 // TODO: we could return a pair of values that specify the max VF and
6279 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6280 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6281 // doesn't have a cost model that can choose which plan to execute if
6282 // more than one is generated.
6283 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6284 LoopVectorizationCostModel &CM) {
6285 unsigned WidestType;
6286 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6287 return WidestVectorRegBits / WidestType;
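 // For example, the division above yields a VF of 256 / 32 = 8 for 256-bit
 // vector registers and a widest in-loop scalar type of 32 bits.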
6290 VectorizationFactor
6291 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6292 unsigned VF = UserVF;
6293 // Outer loop handling: outer loops may require CFG and instruction level
6294 // transformations before even evaluating whether vectorization is profitable.
6295 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6296 // the vectorization pipeline.
6297 if (!OrigLoop->empty()) {
6298 // If the user doesn't provide a vectorization factor, determine a
6299 // reasonable one.
6300 if (!UserVF) {
6301 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6302 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6304 // Make sure we have a VF > 1 for stress testing.
6305 if (VPlanBuildStressTest && VF < 2) {
6306 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6307 << "overriding computed VF.\n");
6308 VF = 4;
6311 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6312 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6313 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6314 << " to build VPlans.\n");
6315 buildVPlans(VF, VF);
6317 // For VPlan build stress testing, we bail out after VPlan construction.
6318 if (VPlanBuildStressTest)
6319 return VectorizationFactor::Disabled();
6321 return {VF, 0};
6324 LLVM_DEBUG(
6325 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6326 "VPlan-native path.\n");
6327 return VectorizationFactor::Disabled();
6330 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6331 assert(OrigLoop->empty() && "Inner loop expected.");
6332 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6333 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6334 return None;
6336 // Invalidate interleave groups if all blocks of the loop will be predicated.
6337 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6338 !useMaskedInterleavedAccesses(*TTI)) {
6339 LLVM_DEBUG(
6340 dbgs()
6341 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6342 "which requires masked-interleaved support.\n");
6343 CM.InterleaveInfo.reset();
6346 if (UserVF) {
6347 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6348 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6349 // Collect the instructions (and their associated costs) that will be more
6350 // profitable to scalarize.
6351 CM.selectUserVectorizationFactor(UserVF);
6352 buildVPlansWithVPRecipes(UserVF, UserVF);
6353 LLVM_DEBUG(printPlans(dbgs()));
6354 return {{UserVF, 0}};
6357 unsigned MaxVF = MaybeMaxVF.getValue();
6358 assert(MaxVF != 0 && "MaxVF is zero.");
6360 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6361 // Collect Uniform and Scalar instructions after vectorization with VF.
6362 CM.collectUniformsAndScalars(VF);
6364 // Collect the instructions (and their associated costs) that will be more
6365 // profitable to scalarize.
6366 if (VF > 1)
6367 CM.collectInstsToScalarize(VF);
6370 buildVPlansWithVPRecipes(1, MaxVF);
6371 LLVM_DEBUG(printPlans(dbgs()));
6372 if (MaxVF == 1)
6373 return VectorizationFactor::Disabled();
6375 // Select the optimal vectorization factor.
6376 return CM.selectVectorizationFactor(MaxVF);
6379 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6380 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6381 << '\n');
6382 BestVF = VF;
6383 BestUF = UF;
6385 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6386 return !Plan->hasVF(VF);
6388 assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6391 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6392 DominatorTree *DT) {
6393 // Perform the actual loop transformation.
6395 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6396 VPCallbackILV CallbackILV(ILV);
6398 VPTransformState State{BestVF, BestUF, LI,
6399 DT, ILV.Builder, ILV.VectorLoopValueMap,
6400 &ILV, CallbackILV};
6401 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6402 State.TripCount = ILV.getOrCreateTripCount(nullptr);
6404 //===------------------------------------------------===//
6406 // Notice: any optimization or new instruction that goes
6407 // into the code below should also be implemented in
6408 // the cost-model.
6410 //===------------------------------------------------===//
6412 // 2. Copy and widen instructions from the old loop into the new loop.
6413 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6414 VPlans.front()->execute(&State);
6416 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6417 // predication, updating analyses.
6418 ILV.fixVectorizedLoop();
6421 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6422 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6423 BasicBlock *Latch = OrigLoop->getLoopLatch();
6425 // We create new control-flow for the vectorized loop, so the original
6426 // condition will be dead after vectorization if it's only used by the
6427 // branch.
6428 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6429 if (Cmp && Cmp->hasOneUse())
6430 DeadInstructions.insert(Cmp);
6432 // We create new "steps" for induction variable updates to which the original
6433 // induction variables map. An original update instruction will be dead if
6434 // all its users except the induction variable are dead.
6435 for (auto &Induction : *Legal->getInductionVars()) {
6436 PHINode *Ind = Induction.first;
6437 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6438 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6439 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6440 DeadInstructions.end();
6442 DeadInstructions.insert(IndUpdate);
6444 // We record as "Dead" also the type-casting instructions we had identified
6445 // during induction analysis. We don't need any handling for them in the
6446 // vectorized loop because we have proven that, under a proper runtime
6447 // test guarding the vectorized loop, the value of the phi, and the casted
6448 // value of the phi, are the same. The last instruction in this casting chain
6449 // will get its scalar/vector/widened def from the scalar/vector/widened def
6450 // of the respective phi node. Any other casts in the induction def-use chain
6451 // have no other uses outside the phi update chain, and will be ignored.
6452 InductionDescriptor &IndDes = Induction.second;
6453 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6454 DeadInstructions.insert(Casts.begin(), Casts.end());
6458 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6460 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6462 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6463 Instruction::BinaryOps BinOp) {
6464 // When unrolling and the VF is 1, we only need to add a simple scalar.
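 // In other words, unrolled copy number StartIdx of the induction gets
 // Val + StartIdx * Step; e.g. with StartIdx = 3 the integer path below emits
 // a multiply of 3 by Step followed by an add named "induction", while the FP
 // path uses the supplied binary operator and fast-math flags instead.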
6465 Type *Ty = Val->getType();
6466 assert(!Ty->isVectorTy() && "Val must be a scalar");
6468 if (Ty->isFloatingPointTy()) {
6469 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6471 // Floating point operations had to be 'fast' to enable the unrolling.
6472 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6473 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6475 Constant *C = ConstantInt::get(Ty, StartIdx);
6476 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6479 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6480 SmallVector<Metadata *, 4> MDs;
6481 // Reserve first location for self reference to the LoopID metadata node.
6482 MDs.push_back(nullptr);
6483 bool IsUnrollMetadata = false;
6484 MDNode *LoopID = L->getLoopID();
6485 if (LoopID) {
6486 // First find existing loop unrolling disable metadata.
6487 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6488 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6489 if (MD) {
6490 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6491 IsUnrollMetadata =
6492 S && S->getString().startswith("llvm.loop.unroll.disable");
6494 MDs.push_back(LoopID->getOperand(i));
6498 if (!IsUnrollMetadata) {
6499 // Add runtime unroll disable metadata.
6500 LLVMContext &Context = L->getHeader()->getContext();
6501 SmallVector<Metadata *, 1> DisableOperands;
6502 DisableOperands.push_back(
6503 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6504 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6505 MDs.push_back(DisableNode);
6506 MDNode *NewLoopID = MDNode::get(Context, MDs);
6507 // Set operand 0 to refer to the loop id itself.
6508 NewLoopID->replaceOperandWith(0, NewLoopID);
6509 L->setLoopID(NewLoopID);
6513 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6514 const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6515 assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6516 bool PredicateAtRangeStart = Predicate(Range.Start);
6518 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6519 if (Predicate(TmpVF) != PredicateAtRangeStart) {
6520 Range.End = TmpVF;
6521 break;
6524 return PredicateAtRangeStart;
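 // For example, with Range = {2, 16} and a predicate that holds for VF = 2 and
 // VF = 4 but not for VF = 8, the loop above clamps Range.End to 8 and the
 // function returns true (the predicate's value at Range.Start).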
6527 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6528 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6529 /// of VF's starting at a given VF and extending it as much as possible. Each
6530 /// vectorization decision can potentially shorten this sub-range during
6531 /// buildVPlan().
6532 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6533 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6534 VFRange SubRange = {VF, MaxVF + 1};
6535 VPlans.push_back(buildVPlan(SubRange));
6536 VF = SubRange.End;
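 // For example, starting from {1, 9} (MinVF = 1, MaxVF = 8), a decision that
 // changes at VF = 4 clamps the first sub-range to {1, 4}, so one VPlan covers
 // VF 1 and 2 and the next iteration builds a plan starting at VF = 4.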
6540 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6541 VPlanPtr &Plan) {
6542 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6544 // Look for cached value.
6545 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6546 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6547 if (ECEntryIt != EdgeMaskCache.end())
6548 return ECEntryIt->second;
6550 VPValue *SrcMask = createBlockInMask(Src, Plan);
6552 // The terminator has to be a branch inst!
6553 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6554 assert(BI && "Unexpected terminator found");
6556 if (!BI->isConditional())
6557 return EdgeMaskCache[Edge] = SrcMask;
6559 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6560 assert(EdgeMask && "No Edge Mask found for condition");
6562 if (BI->getSuccessor(0) != Dst)
6563 EdgeMask = Builder.createNot(EdgeMask);
6565 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6566 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6568 return EdgeMaskCache[Edge] = EdgeMask;
6571 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6572 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6574 // Look for cached value.
6575 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6576 if (BCEntryIt != BlockMaskCache.end())
6577 return BCEntryIt->second;
6579 // All-one mask is modelled as no-mask following the convention for masked
6580 // load/store/gather/scatter. Initialize BlockMask to no-mask.
6581 VPValue *BlockMask = nullptr;
6583 if (OrigLoop->getHeader() == BB) {
6584 if (!CM.blockNeedsPredication(BB))
6585 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6587 // Introduce the early-exit compare IV <= BTC to form header block mask.
6588 // This is used instead of IV < TC because TC may wrap, unlike BTC.
6589 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6590 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6591 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6592 return BlockMaskCache[BB] = BlockMask;
6595 // This is the block mask. We OR all incoming edges.
6596 for (auto *Predecessor : predecessors(BB)) {
6597 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6598 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6599 return BlockMaskCache[BB] = EdgeMask;
6601 if (!BlockMask) { // BlockMask has its initialized nullptr value.
6602 BlockMask = EdgeMask;
6603 continue;
6606 BlockMask = Builder.createOr(BlockMask, EdgeMask);
6609 return BlockMaskCache[BB] = BlockMask;
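 // For example, when folding the tail of a single-block loop, the header mask
 // above is just the ICmpULE(IV, BTC) comparison. For an if-converted diamond,
 // the masked block's mask is the OR of its incoming edge masks, where each
 // edge mask is the branch condition (negated for the false successor) ANDed
 // with the source block's mask.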
6612 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6613 VFRange &Range,
6614 VPlanPtr &Plan) {
6615 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6616 if (!IG)
6617 return nullptr;
6619 // Now check if IG is relevant for VF's in the given range.
6620 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6621 return [=](unsigned VF) -> bool {
6622 return (VF >= 2 && // Query is illegal for VF == 1
6623 CM.getWideningDecision(I, VF) ==
6624 LoopVectorizationCostModel::CM_Interleave);
6627 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6628 return nullptr;
6630 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6631 // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
6632 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6633 assert(I == IG->getInsertPos() &&
6634 "Generating a recipe for an adjunct member of an interleave group");
6636 VPValue *Mask = nullptr;
6637 if (Legal->isMaskRequired(I))
6638 Mask = createBlockInMask(I->getParent(), Plan);
6640 return new VPInterleaveRecipe(IG, Mask);
6643 VPWidenMemoryInstructionRecipe *
6644 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6645 VPlanPtr &Plan) {
6646 if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6647 return nullptr;
6649 auto willWiden = [&](unsigned VF) -> bool {
6650 if (VF == 1)
6651 return false;
6652 if (CM.isScalarAfterVectorization(I, VF) ||
6653 CM.isProfitableToScalarize(I, VF))
6654 return false;
6655 LoopVectorizationCostModel::InstWidening Decision =
6656 CM.getWideningDecision(I, VF);
6657 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6658 "CM decision should be taken at this point.");
6659 assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6660 "Interleave memory opportunity should be caught earlier.");
6661 return Decision != LoopVectorizationCostModel::CM_Scalarize;
6664 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6665 return nullptr;
6667 VPValue *Mask = nullptr;
6668 if (Legal->isMaskRequired(I))
6669 Mask = createBlockInMask(I->getParent(), Plan);
6671 return new VPWidenMemoryInstructionRecipe(*I, Mask);
6674 VPWidenIntOrFpInductionRecipe *
6675 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6676 if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6677 // Check if this is an integer or fp induction. If so, build the recipe that
6678 // produces its scalar and vector values.
6679 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6680 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6681 II.getKind() == InductionDescriptor::IK_FpInduction)
6682 return new VPWidenIntOrFpInductionRecipe(Phi);
6684 return nullptr;
6687 // Optimize the special case where the source is a constant integer
6688 // induction variable. Notice that we can only optimize the 'trunc' case
6689 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6690 // (c) other casts depend on pointer size.
6692 // Determine whether \p K is a truncation based on an induction variable that
6693 // can be optimized.
6694 auto isOptimizableIVTruncate =
6695 [&](Instruction *K) -> std::function<bool(unsigned)> {
6696 return
6697 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6700 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6701 isOptimizableIVTruncate(I), Range))
6702 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6703 cast<TruncInst>(I));
6704 return nullptr;
6707 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6708 PHINode *Phi = dyn_cast<PHINode>(I);
6709 if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6710 return nullptr;
6712 // We know that all PHIs in non-header blocks are converted into selects, so
6713 // we don't have to worry about the insertion order and we can just use the
6714 // builder. At this point we generate the predication tree. There may be
6715 // duplications since this is a simple recursive scan, but future
6716 // optimizations will clean it up.
6718 SmallVector<VPValue *, 2> Masks;
6719 unsigned NumIncoming = Phi->getNumIncomingValues();
6720 for (unsigned In = 0; In < NumIncoming; In++) {
6721 VPValue *EdgeMask =
6722 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6723 assert((EdgeMask || NumIncoming == 1) &&
6724 "Multiple predecessors with one having a full mask");
6725 if (EdgeMask)
6726 Masks.push_back(EdgeMask);
6728 return new VPBlendRecipe(Phi, Masks);
6731 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6732 VFRange &Range) {
6734 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6735 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6737 if (IsPredicated)
6738 return false;
6740 auto IsVectorizableOpcode = [](unsigned Opcode) {
6741 switch (Opcode) {
6742 case Instruction::Add:
6743 case Instruction::And:
6744 case Instruction::AShr:
6745 case Instruction::BitCast:
6746 case Instruction::Br:
6747 case Instruction::Call:
6748 case Instruction::FAdd:
6749 case Instruction::FCmp:
6750 case Instruction::FDiv:
6751 case Instruction::FMul:
6752 case Instruction::FNeg:
6753 case Instruction::FPExt:
6754 case Instruction::FPToSI:
6755 case Instruction::FPToUI:
6756 case Instruction::FPTrunc:
6757 case Instruction::FRem:
6758 case Instruction::FSub:
6759 case Instruction::GetElementPtr:
6760 case Instruction::ICmp:
6761 case Instruction::IntToPtr:
6762 case Instruction::Load:
6763 case Instruction::LShr:
6764 case Instruction::Mul:
6765 case Instruction::Or:
6766 case Instruction::PHI:
6767 case Instruction::PtrToInt:
6768 case Instruction::SDiv:
6769 case Instruction::Select:
6770 case Instruction::SExt:
6771 case Instruction::Shl:
6772 case Instruction::SIToFP:
6773 case Instruction::SRem:
6774 case Instruction::Store:
6775 case Instruction::Sub:
6776 case Instruction::Trunc:
6777 case Instruction::UDiv:
6778 case Instruction::UIToFP:
6779 case Instruction::URem:
6780 case Instruction::Xor:
6781 case Instruction::ZExt:
6782 return true;
6784 return false;
6787 if (!IsVectorizableOpcode(I->getOpcode()))
6788 return false;
6790 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6791 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6792 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6793 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6794 return false;
6797 auto willWiden = [&](unsigned VF) -> bool {
6798 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6799 CM.isProfitableToScalarize(I, VF)))
6800 return false;
6801 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6802 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6803 // The following case may be scalarized depending on the VF.
6804 // The flag shows whether we use an Intrinsic or a regular Call for the
6805 // vectorized version of the instruction.
6806 // Is it beneficial to perform the intrinsic call compared to a lib call?
6807 bool NeedToScalarize;
6808 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6809 bool UseVectorIntrinsic =
6810 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6811 return UseVectorIntrinsic || !NeedToScalarize;
6813 if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6814 assert(CM.getWideningDecision(I, VF) ==
6815 LoopVectorizationCostModel::CM_Scalarize &&
6816 "Memory widening decisions should have been taken care by now");
6817 return false;
6819 return true;
6822 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6823 return false;
6825 // Success: widen this instruction. We optimize the common case where
6826 // consecutive instructions can be represented by a single recipe.
6827 if (!VPBB->empty()) {
6828 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6829 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6830 return true;
6833 VPBB->appendRecipe(new VPWidenRecipe(I));
6834 return true;
6837 VPBasicBlock *VPRecipeBuilder::handleReplication(
6838 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6839 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6840 VPlanPtr &Plan) {
6841 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6842 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6843 Range);
6845 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6846 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6848 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6850 // Find if I uses a predicated instruction. If so, it will use its scalar
6851 // value. Avoid hoisting the insert-element which packs the scalar value into
6852 // a vector value, as that happens iff all users use the vector value.
6853 for (auto &Op : I->operands())
6854 if (auto *PredInst = dyn_cast<Instruction>(Op))
6855 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6856 PredInst2Recipe[PredInst]->setAlsoPack(false);
6858 // Finalize the recipe for Instr, first if it is not predicated.
6859 if (!IsPredicated) {
6860 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6861 VPBB->appendRecipe(Recipe);
6862 return VPBB;
6864 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6865 assert(VPBB->getSuccessors().empty() &&
6866 "VPBB has successors when handling predicated replication.");
6867 // Record predicated instructions for above packing optimizations.
6868 PredInst2Recipe[I] = Recipe;
6869 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6870 VPBlockUtils::insertBlockAfter(Region, VPBB);
6871 auto *RegSucc = new VPBasicBlock();
6872 VPBlockUtils::insertBlockAfter(RegSucc, Region);
6873 return RegSucc;
6876 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6877 VPRecipeBase *PredRecipe,
6878 VPlanPtr &Plan) {
6879 // Instructions marked for predication are replicated and placed under an
6880 // if-then construct to prevent side-effects.
6882 // Generate recipes to compute the block mask for this region.
6883 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6885 // Build the triangular if-then region.
6886 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6887 assert(Instr->getParent() && "Predicated instruction not in any basic block");
6888 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6889 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6890 auto *PHIRecipe =
6891 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6892 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6893 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6894 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6896 // Note: first set Entry as region entry and then connect successors starting
6897 // from it in order, to propagate the "parent" of each VPBasicBlock.
6898 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6899 VPBlockUtils::connectBlocks(Pred, Exit);
6901 return Region;
6904 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6905 VPlanPtr &Plan, VPBasicBlock *VPBB) {
6906 VPRecipeBase *Recipe = nullptr;
6907 // Check if Instr should belong to an interleave memory recipe, or already
6908 // does. In the latter case Instr is irrelevant.
6909 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6910 VPBB->appendRecipe(Recipe);
6911 return true;
6914 // Check if Instr is a memory operation that should be widened.
6915 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6916 VPBB->appendRecipe(Recipe);
6917 return true;
6920 // Check if Instr should form some PHI recipe.
6921 if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6922 VPBB->appendRecipe(Recipe);
6923 return true;
6925 if ((Recipe = tryToBlend(Instr, Plan))) {
6926 VPBB->appendRecipe(Recipe);
6927 return true;
6929 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6930 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6931 return true;
6934 // Check if Instr is to be widened by a general VPWidenRecipe, after
6935 // having first checked for specific widening recipes that deal with
6936 // Interleave Groups, Inductions and Phi nodes.
6937 if (tryToWiden(Instr, VPBB, Range))
6938 return true;
6940 return false;
6943 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6944 unsigned MaxVF) {
6945 assert(OrigLoop->empty() && "Inner loop expected.");
6947 // Collect conditions feeding internal conditional branches; they need to be
6948 // represented in VPlan for it to model masking.
6949 SmallPtrSet<Value *, 1> NeedDef;
6951 auto *Latch = OrigLoop->getLoopLatch();
6952 for (BasicBlock *BB : OrigLoop->blocks()) {
6953 if (BB == Latch)
6954 continue;
6955 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6956 if (Branch && Branch->isConditional())
6957 NeedDef.insert(Branch->getCondition());
6960 // If the tail is to be folded by masking, the primary induction variable
6961 // needs to be represented in VPlan for it to model early-exit masking.
6962 // Also, both the Phi and the live-out instruction of each reduction are
6963 // required in order to introduce a select between them in VPlan.
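// (Illustrative sketch: with tail folding the header mask is conceptually an
// icmp ule of the widened primary induction against the backedge-taken count,
// so the induction needs a VPValue; likewise each reduction's phi and loop-exit
// value need VPValues so the select created at the end of VPlan construction
// can refer to them.)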
6964 if (CM.foldTailByMasking()) {
6965 NeedDef.insert(Legal->getPrimaryInduction());
6966 for (auto &Reduction : *Legal->getReductionVars()) {
6967 NeedDef.insert(Reduction.first);
6968 NeedDef.insert(Reduction.second.getLoopExitInstr());
6972 // Collect instructions from the original loop that will become trivially dead
6973 // in the vectorized loop. We don't need to vectorize these instructions. For
6974 // example, original induction update instructions can become dead because we
6975 // separately emit induction "steps" when generating code for the new loop.
6976 // Similarly, we create a new latch condition when setting up the structure
6977 // of the new loop, so the old one can become dead.
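// (Illustrative sketch, with hypothetical IR: in a loop containing
//   %iv.next = add i64 %iv, 1
//   %cond = icmp eq i64 %iv.next, %n
// both instructions can end up in DeadInstructions, since the vector loop gets
// its own induction update and latch compare.)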
6978 SmallPtrSet<Instruction *, 4> DeadInstructions;
6979 collectTriviallyDeadInstructions(DeadInstructions);
6981 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6982 VFRange SubRange = {VF, MaxVF + 1};
6983 VPlans.push_back(
6984 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6985 VF = SubRange.End;
6989 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6990 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6991 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6992 // Hold a mapping from predicated instructions to their recipes, in order to
6993 // fix their AlsoPack behavior if a user is determined to replicate and use a
6994 // scalar instead of vector value.
6995 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6997 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6998 DenseMap<Instruction *, Instruction *> SinkAfterInverse;
7000 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7001 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7002 auto Plan = std::make_unique<VPlan>(VPBB);
7004 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7005 // Represent values that will have defs inside VPlan.
7006 for (Value *V : NeedDef)
7007 Plan->addVPValue(V);
7009 // Scan the body of the loop in a topological order to visit each basic block
7010 // after having visited its predecessor basic blocks.
7011 LoopBlocksDFS DFS(OrigLoop);
7012 DFS.perform(LI);
7014 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7015 // Relevant instructions from basic block BB will be grouped into VPRecipe
7016 // ingredients and fill a new VPBasicBlock.
7017 unsigned VPBBsForBB = 0;
7018 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7019 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7020 VPBB = FirstVPBBForBB;
7021 Builder.setInsertPoint(VPBB);
7023 std::vector<Instruction *> Ingredients;
7025 // Organize the ingredients to vectorize from current basic block in the
7026 // right order.
7027 for (Instruction &I : BB->instructionsWithoutDebug()) {
7028 Instruction *Instr = &I;
7030 // First filter out irrelevant instructions, to ensure no recipes are
7031 // built for them.
7032 if (isa<BranchInst>(Instr) ||
7033 DeadInstructions.find(Instr) != DeadInstructions.end())
7034 continue;
7036 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
7037 // member of the IG, do not construct any Recipe for it.
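// (Illustrative sketch, with a hypothetical interleave group of factor 2 over
// loads %x (the insert position) and %y: only %x receives a recipe, the
// VPInterleaveRecipe; %y is an adjunct member and is skipped here, except that
// an instruction recorded to be sunk after %y is pushed in its place.)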
7038 const InterleaveGroup<Instruction> *IG =
7039 CM.getInterleavedAccessGroup(Instr);
7040 if (IG && Instr != IG->getInsertPos() &&
7041 Range.Start >= 2 && // Query is illegal for VF == 1
7042 CM.getWideningDecision(Instr, Range.Start) ==
7043 LoopVectorizationCostModel::CM_Interleave) {
7044 auto SinkCandidate = SinkAfterInverse.find(Instr);
7045 if (SinkCandidate != SinkAfterInverse.end())
7046 Ingredients.push_back(SinkCandidate->second);
7047 continue;
7050 // Move instructions to handle first-order recurrences, step 1: avoid
7051 // handling this instruction until after we've handled the instruction it
7052 // should follow.
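// (Illustrative sketch, with hypothetical instructions: if SinkAfter maps
// %a -> %b, meaning "sink %a after %b", then %a is skipped when first visited
// and SinkAfterInverse[%b] = %a is recorded; step 2 below appends %a to
// Ingredients right after %b, so %a's recipe ends up after %b's.)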
7053 auto SAIt = SinkAfter.find(Instr);
7054 if (SAIt != SinkAfter.end()) {
7055 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7056 << *SAIt->second
7057 << " to vectorize a 1st order recurrence.\n");
7058 SinkAfterInverse[SAIt->second] = Instr;
7059 continue;
7062 Ingredients.push_back(Instr);
7064 // Move instructions to handle first-order recurrences, step 2: push the
7065 // instruction to be sunk at its insertion point.
7066 auto SAInvIt = SinkAfterInverse.find(Instr);
7067 if (SAInvIt != SinkAfterInverse.end())
7068 Ingredients.push_back(SAInvIt->second);
7071 // Introduce each ingredient into VPlan.
7072 for (Instruction *Instr : Ingredients) {
7073 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7074 continue;
7076 // Otherwise, if all widening options failed, Instruction is to be
7077 // replicated. This may create a successor for VPBB.
7078 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7079 Instr, Range, VPBB, PredInst2Recipe, Plan);
7080 if (NextVPBB != VPBB) {
7081 VPBB = NextVPBB;
7082 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7083 : "");
7088 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7089 // may also be empty, such as the last one (VPBB), reflecting original
7090 // basic-blocks with no recipes.
7091 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7092 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7093 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7094 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7095 delete PreEntry;
7097 // Finally, if tail is folded by masking, introduce selects between the phi
7098 // and the live-out instruction of each reduction, at the end of the latch.
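// (Illustrative sketch, for a hypothetical sum reduction with phi %red.phi and
// loop-exit value %red.next, the recipe created below computes
//   select <header mask>, %red.next, %red.phi
// so lanes masked off by tail folding keep the value carried by the phi.)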
7099 if (CM.foldTailByMasking()) {
7100 Builder.setInsertPoint(VPBB);
7101 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7102 for (auto &Reduction : *Legal->getReductionVars()) {
7103 VPValue *Phi = Plan->getVPValue(Reduction.first);
7104 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7105 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7109 std::string PlanName;
7110 raw_string_ostream RSO(PlanName);
7111 unsigned VF = Range.Start;
7112 Plan->addVF(VF);
7113 RSO << "Initial VPlan for VF={" << VF;
7114 for (VF *= 2; VF < Range.End; VF *= 2) {
7115 Plan->addVF(VF);
7116 RSO << "," << VF;
7118 RSO << "},UF>=1";
7119 RSO.flush();
7120 Plan->setName(PlanName);
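// (Illustrative sketch, with hypothetical bounds: a Range of [4, 16) yields
// the name "Initial VPlan for VF={4,8},UF>=1".)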
7122 return Plan;
7125 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7126 // Outer loop handling: They may require CFG and instruction level
7127 // transformations before even evaluating whether vectorization is profitable.
7128 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7129 // the vectorization pipeline.
7130 assert(!OrigLoop->empty());
7131 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7133 // Create new empty VPlan
7134 auto Plan = std::make_unique<VPlan>();
7136 // Build hierarchical CFG
7137 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7138 HCFGBuilder.buildHierarchicalCFG();
7140 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7141 Plan->addVF(VF);
7143 if (EnableVPlanPredication) {
7144 VPlanPredicator VPP(*Plan);
7145 VPP.predicate();
7147 // Avoid running transformation to recipes until masked code generation in
7148 // VPlan-native path is in place.
7149 return Plan;
7152 SmallPtrSet<Instruction *, 1> DeadInstructions;
7153 VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7154 Plan, Legal->getInductionVars(), DeadInstructions);
7156 return Plan;
7159 Value* LoopVectorizationPlanner::VPCallbackILV::
7160 getOrCreateVectorValues(Value *V, unsigned Part) {
7161 return ILV.getOrCreateVectorValue(V, Part);
7164 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7165 O << " +\n"
7166 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7167 IG->getInsertPos()->printAsOperand(O, false);
7168 if (User) {
7169 O << ", ";
7170 User->getOperand(0)->printAsOperand(O);
7172 O << "\\l\"";
7173 for (unsigned i = 0; i < IG->getFactor(); ++i)
7174 if (Instruction *I = IG->getMember(i))
7175 O << " +\n"
7176 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
7179 void VPWidenRecipe::execute(VPTransformState &State) {
7180 for (auto &Instr : make_range(Begin, End))
7181 State.ILV->widenInstruction(Instr);
7184 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7185 assert(!State.Instance && "Int or FP induction being replicated.");
7186 State.ILV->widenIntOrFpInduction(IV, Trunc);
7189 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7190 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7193 void VPBlendRecipe::execute(VPTransformState &State) {
7194 State.ILV->setDebugLocFromInst(State.Builder, Phi);
7195 // We know that all PHIs in non-header blocks are converted into
7196 // selects, so we don't have to worry about the insertion order and we
7197 // can just use the builder.
7198 // At this point we generate the predication tree. There may be
7199 // duplications since this is a simple recursive scan, but future
7200 // optimizations will clean it up.
7202 unsigned NumIncoming = Phi->getNumIncomingValues();
7204 assert((User || NumIncoming == 1) &&
7205 "Multiple predecessors with predecessors having a full mask");
7206 // Generate a sequence of selects of the form:
7207 // SELECT(Mask3, In3,
7208 // SELECT(Mask2, In2,
7209 // ( ...)))
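// (Illustrative sketch, with hypothetical values: for three incoming values
// %v1/%v2/%v3 with edge masks %m1/%m2/%m3 and UF=1, the loop below emits
//   %s = %v1                      ; the first value seeds the chain, %m1 unused
//   %s = select %m2, %v2, %s
//   %predphi = select %m3, %v3, %s
// per unroll part.)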
7210 InnerLoopVectorizer::VectorParts Entry(State.UF);
7211 for (unsigned In = 0; In < NumIncoming; ++In) {
7212 for (unsigned Part = 0; Part < State.UF; ++Part) {
7213 // We might have single edge PHIs (blocks) - use an identity
7214 // 'select' for the first PHI operand.
7215 Value *In0 =
7216 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7217 if (In == 0)
7218 Entry[Part] = In0; // Initialize with the first incoming value.
7219 else {
7220 // Select between the current value and the previous incoming edge
7221 // based on the incoming mask.
7222 Value *Cond = State.get(User->getOperand(In), Part);
7223 Entry[Part] =
7224 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7228 for (unsigned Part = 0; Part < State.UF; ++Part)
7229 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7232 void VPInterleaveRecipe::execute(VPTransformState &State) {
7233 assert(!State.Instance && "Interleave group being replicated.");
7234 if (!User)
7235 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7237 // Last (and currently only) operand is a mask.
7238 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7239 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7240 for (unsigned Part = 0; Part < State.UF; ++Part)
7241 MaskValues[Part] = State.get(Mask, Part);
7242 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7245 void VPReplicateRecipe::execute(VPTransformState &State) {
7246 if (State.Instance) { // Generate a single instance.
7247 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7248 // Insert scalar instance packing it into a vector.
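// (Illustrative sketch, with a hypothetical i32 ingredient and VF=4: lane L of
// part P is packed as
//   %vec.P = insertelement <4 x i32> <prev-or-undef>, i32 %scalar.P.L, i32 L
// starting from undef at lane 0 and threading the growing vector through the
// remaining lanes.)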
7249 if (AlsoPack && State.VF > 1) {
7250 // If we're constructing lane 0, initialize to start from undef.
7251 if (State.Instance->Lane == 0) {
7252 Value *Undef =
7253 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7254 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7256 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7258 return;
7261 // Generate scalar instances for all VF lanes of all UF parts, unless the
7262 // instruction is uniform, in which case generate only the first lane for each
7263 // of the UF parts.
7264 unsigned EndLane = IsUniform ? 1 : State.VF;
7265 for (unsigned Part = 0; Part < State.UF; ++Part)
7266 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7267 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7270 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7271 assert(State.Instance && "Branch on Mask works only on single instance.");
7273 unsigned Part = State.Instance->Part;
7274 unsigned Lane = State.Instance->Lane;
7276 Value *ConditionBit = nullptr;
7277 if (!User) // Block in mask is all-one.
7278 ConditionBit = State.Builder.getTrue();
7279 else {
7280 VPValue *BlockInMask = User->getOperand(0);
7281 ConditionBit = State.get(BlockInMask, Part);
7282 if (ConditionBit->getType()->isVectorTy())
7283 ConditionBit = State.Builder.CreateExtractElement(
7284 ConditionBit, State.Builder.getInt32(Lane));
7287 // Replace the temporary unreachable terminator with a new conditional branch,
7288 // whose two destinations will be set later when they are created.
7289 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7290 assert(isa<UnreachableInst>(CurrentTerminator) &&
7291 "Expected to replace unreachable terminator with conditional branch.");
7292 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7293 CondBr->setSuccessor(0, nullptr);
7294 ReplaceInstWithInst(CurrentTerminator, CondBr);
7297 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7298 assert(State.Instance && "Predicated instruction PHI works per instance.");
7299 Instruction *ScalarPredInst = cast<Instruction>(
7300 State.ValueMap.getScalarValue(PredInst, *State.Instance));
7301 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7302 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7303 assert(PredicatingBB && "Predicated block has no single predecessor.");
7305 // By current pack/unpack logic we need to generate only a single phi node: if
7306 // a vector value for the predicated instruction exists at this point it means
7307 // the instruction has vector users only, and a phi for the vector value is
7308 // needed. In this case the recipe of the predicated instruction is marked to
7309 // also do that packing, thereby "hoisting" the insert-element sequence.
7310 // Otherwise, a phi node for the scalar value is needed.
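// (Illustrative sketch, with hypothetical block names pred.udiv.entry /
// pred.udiv.if / pred.udiv.continue: when a vector value exists it is the
// insertelement emitted in the "if" block, and the phi built below merges it
// with the still-unmodified vector flowing around the predicated block, e.g.
//   %r = phi <4 x i32> [ %unmodified, %pred.udiv.entry ],
//                      [ %inserted,   %pred.udiv.if ].)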
7311 unsigned Part = State.Instance->Part;
7312 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7313 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7314 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7315 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7316 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7317 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7318 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7319 } else {
7320 Type *PredInstType = PredInst->getType();
7321 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7322 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7323 Phi->addIncoming(ScalarPredInst, PredicatedBB);
7324 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7328 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7329 if (!User)
7330 return State.ILV->vectorizeMemoryInstruction(&Instr);
7332 // Last (and currently only) operand is a mask.
7333 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7334 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7335 for (unsigned Part = 0; Part < State.UF; ++Part)
7336 MaskValues[Part] = State.get(Mask, Part);
7337 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7340 static ScalarEpilogueLowering
7341 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7342 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7343 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7344 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7345 (F->hasOptSize() ||
7346 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7347 SEL = CM_ScalarEpilogueNotAllowedOptSize;
7348 else if (PreferPredicateOverEpilog || Hints.getPredicate())
7349 SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7351 return SEL;
7354 // Process the loop in the VPlan-native vectorization path. This path builds
7355 // VPlan upfront in the vectorization pipeline, which makes it possible to apply
7356 // VPlan-to-VPlan transformations from the very beginning without modifying the
7357 // input LLVM IR.
7358 static bool processLoopInVPlanNativePath(
7359 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7360 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7361 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7362 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7363 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7365 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7366 Function *F = L->getHeader()->getParent();
7367 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7368 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7370 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7371 &Hints, IAI);
7372 // Use the planner for outer loop vectorization.
7373 // TODO: CM is not used at this point inside the planner. Turn CM into an
7374 // optional argument if we don't need it in the future.
7375 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7377 // Get user vectorization factor.
7378 const unsigned UserVF = Hints.getWidth();
7380 // Plan how to best vectorize, return the best VF and its cost.
7381 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7383 // If we are stress testing VPlan builds, do not attempt to generate vector
7384 // code. Masked vector code generation support will follow soon.
7385 // Also, do not attempt to vectorize if no vector code will be produced.
7386 if (VPlanBuildStressTest || EnableVPlanPredication ||
7387 VectorizationFactor::Disabled() == VF)
7388 return false;
7390 LVP.setBestPlan(VF.Width, 1);
7392 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7393 &CM);
7394 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7395 << L->getHeader()->getParent()->getName() << "\"\n");
7396 LVP.executePlan(LB, DT);
7398 // Mark the loop as already vectorized to avoid vectorizing again.
7399 Hints.setAlreadyVectorized();
7401 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7402 return true;
7405 bool LoopVectorizePass::processLoop(Loop *L) {
7406 assert((EnableVPlanNativePath || L->empty()) &&
7407 "VPlan-native path is not enabled. Only process inner loops.");
7409 #ifndef NDEBUG
7410 const std::string DebugLocStr = getDebugLocString(L);
7411 #endif /* NDEBUG */
7413 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7414 << L->getHeader()->getParent()->getName() << "\" from "
7415 << DebugLocStr << "\n");
7417 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7419 LLVM_DEBUG(
7420 dbgs() << "LV: Loop hints:"
7421 << " force="
7422 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7423 ? "disabled"
7424 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7425 ? "enabled"
7426 : "?"))
7427 << " width=" << Hints.getWidth()
7428 << " unroll=" << Hints.getInterleave() << "\n");
7430 // Function containing loop
7431 Function *F = L->getHeader()->getParent();
7433 // Looking at the diagnostic output is the only way to determine if a loop
7434 // was vectorized (other than looking at the IR or machine code), so it
7435 // is important to generate an optimization remark for each loop. Most of
7436 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7437 // generated as OptimizationRemark and OptimizationRemarkMissed are less
7438 // verbose; they report, respectively, vectorized loops and unvectorized
7439 // loops that may benefit from vectorization.
7441 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7442 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7443 return false;
7446 PredicatedScalarEvolution PSE(*SE, *L);
7448 // Check if it is legal to vectorize the loop.
7449 LoopVectorizationRequirements Requirements(*ORE);
7450 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7451 &Requirements, &Hints, DB, AC);
7452 if (!LVL.canVectorize(EnableVPlanNativePath)) {
7453 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7454 Hints.emitRemarkWithHints();
7455 return false;
7458 // Check the function attributes and profiles to find out if this function
7459 // should be optimized for size.
7460 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7462 // Entrance to the VPlan-native vectorization path. Outer loops are processed
7463 // here. They may require CFG and instruction level transformations before
7464 // even evaluating whether vectorization is profitable. Since we cannot modify
7465 // the incoming IR, we need to build VPlan upfront in the vectorization
7466 // pipeline.
7467 if (!L->empty())
7468 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7469 ORE, BFI, PSI, Hints);
7471 assert(L->empty() && "Inner loop expected.");
7472 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7473 // count by optimizing for size, to minimize overheads.
7474 // Prefer a constant trip count, then profile data, then the upper-bound estimate.
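// (Illustrative sketch, with hypothetical numbers: a loop known to run 3
// iterations, below a TinyTripCountVectorThreshold of, say, 16, has its
// scalar-epilogue lowering switched below to
// CM_ScalarEpilogueNotAllowedLowTripLoop, unless vectorization was explicitly
// forced.)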
7475 unsigned ExpectedTC = 0;
7476 bool HasExpectedTC = false;
7477 if (const SCEVConstant *ConstExits =
7478 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7479 const APInt &ExitsCount = ConstExits->getAPInt();
7480 // We are interested in small values for ExpectedTC. Skip over those that
7481 // can't fit in an unsigned.
7482 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7483 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7484 HasExpectedTC = true;
7487 // ExpectedTC may be large because it's bound by a variable. Check
7488 // profiling information to validate we should vectorize.
7489 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7490 auto EstimatedTC = getLoopEstimatedTripCount(L);
7491 if (EstimatedTC) {
7492 ExpectedTC = *EstimatedTC;
7493 HasExpectedTC = true;
7496 if (!HasExpectedTC) {
7497 ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7498 HasExpectedTC = (ExpectedTC > 0);
7501 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7502 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7503 << "This loop is worth vectorizing only if no scalar "
7504 << "iteration overheads are incurred.");
7505 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7506 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7507 else {
7508 LLVM_DEBUG(dbgs() << "\n");
7509 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7513 // Check the function attributes to see if implicit floats are allowed.
7514 // FIXME: This check doesn't seem possibly correct -- what if the loop is
7515 // an integer loop and the vector instructions selected are purely integer
7516 // vector instructions?
7517 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7518 reportVectorizationFailure(
7519 "Can't vectorize when the NoImplicitFloat attribute is used",
7520 "loop not vectorized due to NoImplicitFloat attribute",
7521 "NoImplicitFloat", ORE, L);
7522 Hints.emitRemarkWithHints();
7523 return false;
7526 // Check if the target supports potentially unsafe FP vectorization.
7527 // FIXME: Add a check for the type of safety issue (denormal, signaling)
7528 // for the target we're vectorizing for, to make sure none of the
7529 // additional fp-math flags can help.
7530 if (Hints.isPotentiallyUnsafe() &&
7531 TTI->isFPVectorizationPotentiallyUnsafe()) {
7532 reportVectorizationFailure(
7533 "Potentially unsafe FP op prevents vectorization",
7534 "loop not vectorized due to unsafe FP support.",
7535 "UnsafeFP", ORE, L);
7536 Hints.emitRemarkWithHints();
7537 return false;
7540 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7541 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7543 // If an override option has been passed in for interleaved accesses, use it.
7544 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7545 UseInterleaved = EnableInterleavedMemAccesses;
7547 // Analyze interleaved memory accesses.
7548 if (UseInterleaved) {
7549 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7552 // Use the cost model.
7553 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7554 F, &Hints, IAI);
7555 CM.collectValuesToIgnore();
7557 // Use the planner for vectorization.
7558 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7560 // Get user vectorization factor.
7561 unsigned UserVF = Hints.getWidth();
7563 // Plan how to best vectorize, return the best VF and its cost.
7564 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7566 VectorizationFactor VF = VectorizationFactor::Disabled();
7567 unsigned IC = 1;
7568 unsigned UserIC = Hints.getInterleave();
7570 if (MaybeVF) {
7571 VF = *MaybeVF;
7572 // Select the interleave count.
7573 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7576 // Identify the diagnostic messages that should be produced.
7577 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7578 bool VectorizeLoop = true, InterleaveLoop = true;
7579 if (Requirements.doesNotMeet(F, L, Hints)) {
7580 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7581 "requirements.\n");
7582 Hints.emitRemarkWithHints();
7583 return false;
7586 if (VF.Width == 1) {
7587 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7588 VecDiagMsg = std::make_pair(
7589 "VectorizationNotBeneficial",
7590 "the cost-model indicates that vectorization is not beneficial");
7591 VectorizeLoop = false;
7594 if (!MaybeVF && UserIC > 1) {
7595 // Tell the user interleaving was avoided up-front, despite being explicitly
7596 // requested.
7597 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7598 "interleaving should be avoided up front\n");
7599 IntDiagMsg = std::make_pair(
7600 "InterleavingAvoided",
7601 "Ignoring UserIC, because interleaving was avoided up front");
7602 InterleaveLoop = false;
7603 } else if (IC == 1 && UserIC <= 1) {
7604 // Tell the user interleaving is not beneficial.
7605 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7606 IntDiagMsg = std::make_pair(
7607 "InterleavingNotBeneficial",
7608 "the cost-model indicates that interleaving is not beneficial");
7609 InterleaveLoop = false;
7610 if (UserIC == 1) {
7611 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7612 IntDiagMsg.second +=
7613 " and is explicitly disabled or interleave count is set to 1";
7615 } else if (IC > 1 && UserIC == 1) {
7616 // Tell the user interleaving is beneficial, but it is explicitly disabled.
7617 LLVM_DEBUG(
7618 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7619 IntDiagMsg = std::make_pair(
7620 "InterleavingBeneficialButDisabled",
7621 "the cost-model indicates that interleaving is beneficial "
7622 "but is explicitly disabled or interleave count is set to 1");
7623 InterleaveLoop = false;
7626 // Override IC if user provided an interleave count.
7627 IC = UserIC > 0 ? UserIC : IC;
7629 // Emit diagnostic messages, if any.
7630 const char *VAPassName = Hints.vectorizeAnalysisPassName();
7631 if (!VectorizeLoop && !InterleaveLoop) {
7632 // Do not vectorize or interleave the loop.
7633 ORE->emit([&]() {
7634 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7635 L->getStartLoc(), L->getHeader())
7636 << VecDiagMsg.second;
7638 ORE->emit([&]() {
7639 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7640 L->getStartLoc(), L->getHeader())
7641 << IntDiagMsg.second;
7643 return false;
7644 } else if (!VectorizeLoop && InterleaveLoop) {
7645 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7646 ORE->emit([&]() {
7647 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7648 L->getStartLoc(), L->getHeader())
7649 << VecDiagMsg.second;
7651 } else if (VectorizeLoop && !InterleaveLoop) {
7652 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7653 << ") in " << DebugLocStr << '\n');
7654 ORE->emit([&]() {
7655 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7656 L->getStartLoc(), L->getHeader())
7657 << IntDiagMsg.second;
7659 } else if (VectorizeLoop && InterleaveLoop) {
7660 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7661 << ") in " << DebugLocStr << '\n');
7662 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7665 LVP.setBestPlan(VF.Width, IC);
7667 using namespace ore;
7668 bool DisableRuntimeUnroll = false;
7669 MDNode *OrigLoopID = L->getLoopID();
7671 if (!VectorizeLoop) {
7672 assert(IC > 1 && "interleave count should not be 1 or 0");
7673 // If we decided that it is not legal to vectorize the loop, then
7674 // interleave it.
7675 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7676 &CM);
7677 LVP.executePlan(Unroller, DT);
7679 ORE->emit([&]() {
7680 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7681 L->getHeader())
7682 << "interleaved loop (interleaved count: "
7683 << NV("InterleaveCount", IC) << ")";
7685 } else {
7686 // If we decided that it is *legal* to vectorize the loop, then do it.
7687 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7688 &LVL, &CM);
7689 LVP.executePlan(LB, DT);
7690 ++LoopsVectorized;
7692 // Add metadata to disable runtime unrolling of the scalar loop when there are
7693 // no runtime checks about strides and memory. A scalar loop that is
7694 // rarely used is not worth unrolling.
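// (Illustrative sketch: AddRuntimeUnrollDisableMetaData attaches loop metadata
// of the form !{!"llvm.loop.unroll.runtime.disable"} to the remaining scalar
// loop, which the loop unroller honors by skipping runtime unrolling.)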
7695 if (!LB.areSafetyChecksAdded())
7696 DisableRuntimeUnroll = true;
7698 // Report the vectorization decision.
7699 ORE->emit([&]() {
7700 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7701 L->getHeader())
7702 << "vectorized loop (vectorization width: "
7703 << NV("VectorizationFactor", VF.Width)
7704 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7708 Optional<MDNode *> RemainderLoopID =
7709 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7710 LLVMLoopVectorizeFollowupEpilogue});
7711 if (RemainderLoopID.hasValue()) {
7712 L->setLoopID(RemainderLoopID.getValue());
7713 } else {
7714 if (DisableRuntimeUnroll)
7715 AddRuntimeUnrollDisableMetaData(L);
7717 // Mark the loop as already vectorized to avoid vectorizing again.
7718 Hints.setAlreadyVectorized();
7721 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7722 return true;
7725 bool LoopVectorizePass::runImpl(
7726 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7727 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7728 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7729 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7730 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7731 SE = &SE_;
7732 LI = &LI_;
7733 TTI = &TTI_;
7734 DT = &DT_;
7735 BFI = &BFI_;
7736 TLI = TLI_;
7737 AA = &AA_;
7738 AC = &AC_;
7739 GetLAA = &GetLAA_;
7740 DB = &DB_;
7741 ORE = &ORE_;
7742 PSI = PSI_;
7744 // Don't attempt if
7745 // 1. the target claims to have no vector registers, and
7746 // 2. interleaving won't help ILP.
7748 // The second condition is necessary because, even if the target has no
7749 // vector registers, loop vectorization may still enable scalar
7750 // interleaving.
7751 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7752 return false;
7754 bool Changed = false;
7756 // The vectorizer requires loops to be in simplified form.
7757 // Since simplification may add new inner loops, it has to run before the
7758 // legality and profitability checks. This means running the loop vectorizer
7759 // will simplify all loops, regardless of whether anything ends up being
7760 // vectorized.
7761 for (auto &L : *LI)
7762 Changed |=
7763 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7765 // Build up a worklist of inner-loops to vectorize. This is necessary as
7766 // the act of vectorizing or partially unrolling a loop creates new loops
7767 // and can invalidate iterators across the loops.
7768 SmallVector<Loop *, 8> Worklist;
7770 for (Loop *L : *LI)
7771 collectSupportedLoops(*L, LI, ORE, Worklist);
7773 LoopsAnalyzed += Worklist.size();
7775 // Now walk the identified inner loops.
7776 while (!Worklist.empty()) {
7777 Loop *L = Worklist.pop_back_val();
7779 // For the inner loops we actually process, form LCSSA to simplify the
7780 // transform.
7781 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7783 Changed |= processLoop(L);
7786 // Process each loop nest in the function.
7787 return Changed;
7790 PreservedAnalyses LoopVectorizePass::run(Function &F,
7791 FunctionAnalysisManager &AM) {
7792 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7793 auto &LI = AM.getResult<LoopAnalysis>(F);
7794 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7795 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7796 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7797 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7798 auto &AA = AM.getResult<AAManager>(F);
7799 auto &AC = AM.getResult<AssumptionAnalysis>(F);
7800 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7801 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7802 MemorySSA *MSSA = EnableMSSALoopDependency
7803 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7804 : nullptr;
7806 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7807 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7808 [&](Loop &L) -> const LoopAccessInfo & {
7809 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7810 return LAM.getResult<LoopAccessAnalysis>(L, AR);
7812 const ModuleAnalysisManager &MAM =
7813 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7814 ProfileSummaryInfo *PSI =
7815 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7816 bool Changed =
7817 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7818 if (!Changed)
7819 return PreservedAnalyses::all();
7820 PreservedAnalyses PA;
7822 // We currently do not preserve LoopInfo/DominatorTree analyses with outer loop
7823 // vectorization. Until this is addressed, mark these analyses as preserved
7824 // only for non-VPlan-native path.
7825 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7826 if (!EnableVPlanNativePath) {
7827 PA.preserve<LoopAnalysis>();
7828 PA.preserve<DominatorTreeAnalysis>();
7830 PA.preserve<BasicAA>();
7831 PA.preserve<GlobalsAA>();
7832 return PA;