1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
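//
// For illustration only (this example is not part of the original comment):
// a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten for VF = 4 into a loop that performs one SIMD
// add per iteration,
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i:i+4] = b[i:i+4] + c[i:i+4];   // pseudo-code for a <4 x T> add
//
// with the remaining iterations handled by a scalar epilogue loop or by
// predication (tail folding).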
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //===----------------------------------------------------------------------===//
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
54 //===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
154 using namespace llvm;
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162 "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164 "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166 "llvm.loop.vectorize.followup_epilogue";
167 /// @}
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176 cl::desc("Loops with a constant trip count that is smaller than this "
177 "value are vectorized only if no scalar iteration overheads "
178 "are incurred."));
180 // Indicates that an epilogue is undesired; predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184 "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185 cl::desc("Indicate that an epilogue is undesired, predication should be "
186 "used instead."));
188 static cl::opt<bool> MaximizeBandwidth(
189 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190 cl::desc("Maximize bandwidth when selecting vectorization factor which "
191 "will be determined by the smallest type in loop."));
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
207 static cl::opt<unsigned> ForceTargetNumScalarRegs(
208 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209 cl::desc("A flag that overrides the target's number of scalar registers."));
211 static cl::opt<unsigned> ForceTargetNumVectorRegs(
212 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213 cl::desc("A flag that overrides the target's number of vector registers."));
215 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
216 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217 cl::desc("A flag that overrides the target's max interleave factor for "
218 "scalar loops."));
220 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
221 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222 cl::desc("A flag that overrides the target's max interleave factor for "
223 "vectorized loops."));
225 static cl::opt<unsigned> ForceTargetInstructionCost(
226 "force-target-instruction-cost", cl::init(0), cl::Hidden,
227 cl::desc("A flag that overrides the target's expected cost for "
228 "an instruction to a single constant value. Mostly "
229 "useful for getting consistent testing."));
231 static cl::opt<unsigned> SmallLoopCost(
232 "small-loop-cost", cl::init(20), cl::Hidden,
233 cl::desc(
234 "The cost of a loop that is considered 'small' by the interleaver."));
236 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
237 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238 cl::desc("Enable the use of the block frequency analysis to access PGO "
239 "heuristics minimizing code growth in cold regions and being more "
240 "aggressive in hot regions."));
242 // Runtime interleave loops for load/store throughput.
243 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
244 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245 cl::desc(
246 "Enable runtime interleaving until load/store ports are saturated"));
248 /// The number of stores in a loop that are allowed to need predication.
249 static cl::opt<unsigned> NumberOfStoresToPredicate(
250 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251 cl::desc("Max number of stores to be predicated behind an if."));
253 static cl::opt<bool> EnableIndVarRegisterHeur(
254 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255 cl::desc("Count the induction variable only once when interleaving"));
257 static cl::opt<bool> EnableCondStoresVectorization(
258 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259 cl::desc("Enable if predication of stores during vectorization."));
261 static cl::opt<unsigned> MaxNestedScalarReductionIC(
262 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263 cl::desc("The maximum interleave count to use when interleaving a scalar "
264 "reduction in a nested loop."));
266 cl::opt<bool> EnableVPlanNativePath(
267 "enable-vplan-native-path", cl::init(false), cl::Hidden,
268 cl::desc("Enable VPlan-native vectorization path with "
269 "support for outer loop vectorization."));
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
273 cl::opt<bool> EnableVPlanPredication(
274 "enable-vplan-predication", cl::init(false), cl::Hidden,
275 cl::desc("Enable VPlan-native vectorization path predicator with "
276 "support for outer loop vectorization."));
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
279 // VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283 "vplan-build-stress-test", cl::init(false), cl::Hidden,
284 cl::desc(
285 "Build VPlan for every supported loop nest in the function and bail "
286 "out right after the build (stress test the VPlan H-CFG construction "
287 "in the VPlan-native vectorization path)."));
289 cl::opt<bool> llvm::EnableLoopInterleaving(
290 "interleave-loops", cl::init(true), cl::Hidden,
291 cl::desc("Enable loop interleaving in Loop vectorization passes"));
292 cl::opt<bool> llvm::EnableLoopVectorization(
293 "vectorize-loops", cl::init(true), cl::Hidden,
294 cl::desc("Run the Loop vectorization passes"));
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300 if (Scalar->isVoidTy() || VF == 1)
301 return Scalar;
302 return VectorType::get(Scalar, VF);
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308 "Expected Load or Store instruction");
309 if (auto *LI = dyn_cast<LoadInst>(I))
310 return LI->getType();
311 return cast<StoreInst>(I)->getValueOperand()->getType();
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
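///
/// For example (an editor-added illustration; the calls below are
/// hypothetical usage, assuming an x86-64 data layout):
/// \code
///   // x86_fp80: alloc size is 128 bits but store size is 80 bits -> irregular.
///   hasIrregularType(Type::getX86_FP80Ty(Ctx), DL, /*VF=*/1);  // true
///   // i32: alloc size equals store size, so <4 x i32> is bitcast compatible.
///   hasIrregularType(Type::getInt32Ty(Ctx), DL, /*VF=*/4);     // false
/// \endcode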
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318 // Determine if an array of VF elements of type Ty is "bitcast compatible"
319 // with a <VF x Ty> vector.
320 if (VF > 1) {
321 auto *VectorTy = VectorType::get(Ty, VF);
322 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
325 // If the vectorization factor is one, we just check if an array of type Ty
326 // requires padding between elements.
327 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
334 /// TODO: We should use actual block probability here, if available. Currently,
335 /// we always assume predicated blocks have a 50% chance of executing.
336 static unsigned getReciprocalPredBlockProb() { return 2; }
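// Editor's note (not from the original source): callers in the cost model
// divide a predicated block's cost by this value, modeling a 50% execution
// probability. For example:
//
//   unsigned ScalarizedCost = 40;
//   ScalarizedCost /= getReciprocalPredBlockProb(); // 40 / 2 == 20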
338 /// A helper function that adds a 'fast' flag to floating-point operations.
339 static Value *addFastMathFlag(Value *V) {
340 if (isa<FPMathOperator>(V))
341 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342 return V;
345 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
346 if (isa<FPMathOperator>(V))
347 cast<Instruction>(V)->setFastMathFlags(FMF);
348 return V;
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355 : ConstantFP::get(Ty, C);
358 namespace llvm {
360 /// InnerLoopVectorizer vectorizes loops which contain only one basic
361 /// block to a specified vectorization factor (VF).
362 /// This class performs the widening of scalars into vectors, or multiple
363 /// scalars. This class also implements the following features:
364 /// * It inserts an epilogue loop for handling loops that don't have iteration
365 /// counts that are known to be a multiple of the vectorization factor.
366 /// * It handles the code generation for reduction variables.
367 /// * Scalarization (implementation using scalars) of un-vectorizable
368 /// instructions.
369 /// InnerLoopVectorizer does not perform any vectorization-legality
370 /// checks, and relies on the caller to check for the different legality
371 /// aspects. The InnerLoopVectorizer relies on the
372 /// LoopVectorizationLegality class to provide information about the induction
373 /// and reduction variables that were found for a given vectorization factor.
374 class InnerLoopVectorizer {
375 public:
376 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
377 LoopInfo *LI, DominatorTree *DT,
378 const TargetLibraryInfo *TLI,
379 const TargetTransformInfo *TTI, AssumptionCache *AC,
380 OptimizationRemarkEmitter *ORE, unsigned VecWidth,
381 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
382 LoopVectorizationCostModel *CM)
383 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
384 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
385 Builder(PSE.getSE()->getContext()),
386 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
387 virtual ~InnerLoopVectorizer() = default;
389 /// Create a new empty loop. Unlink the old loop and connect the new one.
390 /// Return the pre-header block of the new loop.
391 BasicBlock *createVectorizedLoopSkeleton();
393 /// Widen a single instruction within the innermost loop.
394 void widenInstruction(Instruction &I);
396 /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
397 void fixVectorizedLoop();
399 // Return true if any runtime check is added.
400 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
402 /// A type for vectorized values in the new loop. Each value from the
403 /// original loop, when vectorized, is represented by UF vector values in the
404 /// new unrolled loop, where UF is the unroll factor.
405 using VectorParts = SmallVector<Value *, 2>;
407 /// Vectorize a single PHINode in a block. This method handles the induction
408 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
409 /// arbitrary length vectors.
410 void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
412 /// A helper function to scalarize a single Instruction in the innermost loop.
413 /// Generates a sequence of scalar instances for each lane between \p MinLane
414 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
415 /// inclusive.
416 void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
417 bool IfPredicateInstr);
419 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
420 /// is provided, the integer induction variable will first be truncated to
421 /// the corresponding type.
422 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
424 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
425 /// vector or scalar value on-demand if one is not yet available. When
426 /// vectorizing a loop, we visit the definition of an instruction before its
427 /// uses. When visiting the definition, we either vectorize or scalarize the
428 /// instruction, creating an entry for it in the corresponding map. (In some
429 /// cases, such as induction variables, we will create both vector and scalar
430 /// entries.) Then, as we encounter uses of the definition, we derive values
431 /// for each scalar or vector use unless such a value is already available.
432 /// For example, if we scalarize a definition and one of its uses is vector,
433 /// we build the required vector on-demand with an insertelement sequence
434 /// when visiting the use. Otherwise, if the use is scalar, we can use the
435 /// existing scalar definition.
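///
/// Illustrative example (added for clarity; the IR below is hypothetical and
/// not taken from the original source). If a definition %d was scalarized for
/// VF = 4 into lanes %d.0 .. %d.3 and a later use needs it as a vector, the
/// vector value is built on demand with an insertelement sequence:
/// \code
///   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
///   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
///   %v.2 = insertelement <4 x i32> %v.1,  i32 %d.2, i32 2
///   %v.3 = insertelement <4 x i32> %v.2,  i32 %d.3, i32 3
/// \endcode
/// Conversely, a scalar use of a vectorized definition is served by an
/// extractelement of the requested lane.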
437 /// Return a value in the new loop corresponding to \p V from the original
438 /// loop at unroll index \p Part. If the value has already been vectorized,
439 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
440 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
441 /// a new vector value on-demand by inserting the scalar values into a vector
442 /// with an insertelement sequence. If the value has been neither vectorized
443 /// nor scalarized, it must be loop invariant, so we simply broadcast the
444 /// value into a vector.
445 Value *getOrCreateVectorValue(Value *V, unsigned Part);
447 /// Return a value in the new loop corresponding to \p V from the original
448 /// loop at unroll and vector indices \p Instance. If the value has been
449 /// vectorized but not scalarized, the necessary extractelement instruction
450 /// will be generated.
451 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
453 /// Construct the vector value of a scalarized value \p V one lane at a time.
454 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
456 /// Try to vectorize the interleaved access group that \p Instr belongs to,
457 /// optionally masking the vector operations if \p BlockInMask is non-null.
458 void vectorizeInterleaveGroup(Instruction *Instr,
459 VectorParts *BlockInMask = nullptr);
461 /// Vectorize Load and Store instructions, optionally masking the vector
462 /// operations if \p BlockInMask is non-null.
463 void vectorizeMemoryInstruction(Instruction *Instr,
464 VectorParts *BlockInMask = nullptr);
466 /// Set the debug location in the builder using the debug location in
467 /// the instruction.
468 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
470 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
471 void fixNonInductionPHIs(void);
473 protected:
474 friend class LoopVectorizationPlanner;
476 /// A small list of PHINodes.
477 using PhiVector = SmallVector<PHINode *, 4>;
479 /// A type for scalarized values in the new loop. Each value from the
480 /// original loop, when scalarized, is represented by UF x VF scalar values
481 /// in the new unrolled loop, where UF is the unroll factor and VF is the
482 /// vectorization factor.
483 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
485 /// Set up the values of the IVs correctly when exiting the vector loop.
486 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
487 Value *CountRoundDown, Value *EndValue,
488 BasicBlock *MiddleBlock);
490 /// Create a new induction variable inside L.
491 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
492 Value *Step, Instruction *DL);
494 /// Handle all cross-iteration phis in the header.
495 void fixCrossIterationPHIs();
497 /// Fix a first-order recurrence. This is the second phase of vectorizing
498 /// this phi node.
499 void fixFirstOrderRecurrence(PHINode *Phi);
501 /// Fix a reduction cross-iteration phi. This is the second phase of
502 /// vectorizing this phi node.
503 void fixReduction(PHINode *Phi);
505 /// The Loop exit block may have single value PHI nodes with some
506 /// incoming value. While vectorizing we only handled real values
507 /// that were defined inside the loop and we should have one value for
508 /// each predecessor of its parent basic block. See PR14725.
509 void fixLCSSAPHIs();
511 /// Iteratively sink the scalarized operands of a predicated instruction into
512 /// the block that was created for it.
513 void sinkScalarOperands(Instruction *PredInst);
515 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
516 /// represented as.
517 void truncateToMinimalBitwidths();
519 /// Insert the new loop to the loop hierarchy and pass manager
520 /// and update the analysis passes.
521 void updateAnalysis();
523 /// Create a broadcast instruction. This method generates a broadcast
524 /// instruction (shuffle) for loop invariant values and for the induction
525 /// value. If this is the induction variable then we extend it to N, N+1, ...
526 /// this is needed because each iteration in the loop corresponds to a SIMD
527 /// element.
528 virtual Value *getBroadcastInstrs(Value *V);
530 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
531 /// to each vector element of Val. The sequence starts at StartIdx.
532 /// \p Opcode is relevant for FP induction variable.
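/// For example (illustration only, not part of the original comment): with
/// VF = 4, StartIdx = 0, Step = 1 and Val holding the broadcast induction
/// value <%iv, %iv, %iv, %iv>, the result is <%iv, %iv+1, %iv+2, %iv+3>,
/// i.e. the per-lane values of one vector iteration.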
533 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
534 Instruction::BinaryOps Opcode =
535 Instruction::BinaryOpsEnd);
537 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
538 /// variable on which to base the steps, \p Step is the size of the step, and
539 /// \p EntryVal is the value from the original loop that maps to the steps.
540 /// Note that \p EntryVal doesn't have to be an induction variable - it
541 /// can also be a truncate instruction.
542 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
543 const InductionDescriptor &ID);
545 /// Create a vector induction phi node based on an existing scalar one. \p
546 /// EntryVal is the value from the original loop that maps to the vector phi
547 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
548 /// truncate instruction, instead of widening the original IV, we widen a
549 /// version of the IV truncated to \p EntryVal's type.
550 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
551 Value *Step, Instruction *EntryVal);
553 /// Returns true if an instruction \p I should be scalarized instead of
554 /// vectorized for the chosen vectorization factor.
555 bool shouldScalarizeInstruction(Instruction *I) const;
557 /// Returns true if we should generate a scalar version of \p IV.
558 bool needsScalarInduction(Instruction *IV) const;
560 /// If there is a cast involved in the induction variable \p ID, which should
561 /// be ignored in the vectorized loop body, this function records the
562 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
563 /// cast. We had already proved that the casted Phi is equal to the uncasted
564 /// Phi in the vectorized loop (under a runtime guard), and therefore
565 /// there is no need to vectorize the cast - the same value can be used in the
566 /// vector loop for both the Phi and the cast.
567 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
568 /// otherwise, \p VectorLoopValue is a widened/vectorized value.
570 /// \p EntryVal is the value from the original loop that maps to the vector
571 /// phi node and is used to distinguish what is the IV currently being
572 /// processed - original one (if \p EntryVal is a phi corresponding to the
573 /// original IV) or the "newly-created" one based on the proof mentioned above
574 /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
575 /// latter case \p EntryVal is a TruncInst and we must not record anything for
576 /// that IV, but it's error-prone to expect callers of this routine to care
577 /// about that, hence this explicit parameter.
578 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
579 const Instruction *EntryVal,
580 Value *VectorLoopValue,
581 unsigned Part,
582 unsigned Lane = UINT_MAX);
584 /// Generate a shuffle sequence that will reverse the vector Vec.
585 virtual Value *reverseVector(Value *Vec);
587 /// Returns (and creates if needed) the original loop trip count.
588 Value *getOrCreateTripCount(Loop *NewLoop);
590 /// Returns (and creates if needed) the trip count of the widened loop.
591 Value *getOrCreateVectorTripCount(Loop *NewLoop);
593 /// Returns a bitcasted value to the requested vector type.
594 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
595 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
596 const DataLayout &DL);
598 /// Emit a bypass check to see if the vector trip count is zero, including if
599 /// it overflows.
600 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
602 /// Emit a bypass check to see if all of the SCEV assumptions we've
603 /// had to make are correct.
604 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
606 /// Emit bypass checks to check any memory assumptions we may have made.
607 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
609 /// Compute the transformed value of Index at offset StartValue using step
610 /// StepValue.
611 /// For integer induction, returns StartValue + Index * StepValue.
612 /// For pointer induction, returns StartValue[Index * StepValue].
613 /// FIXME: The newly created binary instructions should contain nsw/nuw
614 /// flags, which can be found from the original scalar operations.
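/// For example (a worked illustration, not part of the original comment): an
/// integer induction with StartValue = 10 and StepValue = 3 maps Index = 4 to
/// 10 + 4 * 3 = 22, while a pointer induction yields the address of
/// StartValue[4 * 3].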
615 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
616 const DataLayout &DL,
617 const InductionDescriptor &ID) const;
619 /// Add additional metadata to \p To that was not present on \p Orig.
621 /// Currently this is used to add the noalias annotations based on the
622 /// inserted memchecks. Use this for instructions that are *cloned* into the
623 /// vector loop.
624 void addNewMetadata(Instruction *To, const Instruction *Orig);
626 /// Add metadata from one instruction to another.
628 /// This includes both the original MDs from \p From and additional ones (\see
629 /// addNewMetadata). Use this for *newly created* instructions in the vector
630 /// loop.
631 void addMetadata(Instruction *To, Instruction *From);
633 /// Similar to the previous function but it adds the metadata to a
634 /// vector of instructions.
635 void addMetadata(ArrayRef<Value *> To, Instruction *From);
637 /// The original loop.
638 Loop *OrigLoop;
640 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
641 /// dynamic knowledge to simplify SCEV expressions and converts them to a
642 /// more usable form.
643 PredicatedScalarEvolution &PSE;
645 /// Loop Info.
646 LoopInfo *LI;
648 /// Dominator Tree.
649 DominatorTree *DT;
651 /// Alias Analysis.
652 AliasAnalysis *AA;
654 /// Target Library Info.
655 const TargetLibraryInfo *TLI;
657 /// Target Transform Info.
658 const TargetTransformInfo *TTI;
660 /// Assumption Cache.
661 AssumptionCache *AC;
663 /// Interface to emit optimization remarks.
664 OptimizationRemarkEmitter *ORE;
666 /// LoopVersioning. It's only set up (non-null) if memchecks were
667 /// used.
669 /// This is currently only used to add no-alias metadata based on the
670 /// memchecks. The actual versioning is performed manually.
671 std::unique_ptr<LoopVersioning> LVer;
673 /// The vectorization SIMD factor to use. Each vector will have this many
674 /// vector elements.
675 unsigned VF;
677 /// The vectorization unroll factor to use. Each scalar is vectorized to this
678 /// many different vector instructions.
679 unsigned UF;
681 /// The builder that we use
682 IRBuilder<> Builder;
684 // --- Vectorization state ---
686 /// The vector-loop preheader.
687 BasicBlock *LoopVectorPreHeader;
689 /// The scalar-loop preheader.
690 BasicBlock *LoopScalarPreHeader;
692 /// Middle Block between the vector and the scalar.
693 BasicBlock *LoopMiddleBlock;
695 /// The ExitBlock of the scalar loop.
696 BasicBlock *LoopExitBlock;
698 /// The vector loop body.
699 BasicBlock *LoopVectorBody;
701 /// The scalar loop body.
702 BasicBlock *LoopScalarBody;
704 /// A list of all bypass blocks. The first block is the entry of the loop.
705 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
707 /// The new Induction variable which was added to the new block.
708 PHINode *Induction = nullptr;
710 /// The induction variable of the old basic block.
711 PHINode *OldInduction = nullptr;
713 /// Maps values from the original loop to their corresponding values in the
714 /// vectorized loop. A key value can map to either vector values, scalar
715 /// values or both kinds of values, depending on whether the key was
716 /// vectorized and scalarized.
717 VectorizerValueMap VectorLoopValueMap;
719 /// Store instructions that were predicated.
720 SmallVector<Instruction *, 4> PredicatedInstructions;
722 /// Trip count of the original loop.
723 Value *TripCount = nullptr;
725 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
726 Value *VectorTripCount = nullptr;
728 /// The legality analysis.
729 LoopVectorizationLegality *Legal;
731 /// The profitability analysis.
732 LoopVectorizationCostModel *Cost;
734 // Record whether runtime checks are added.
735 bool AddedSafetyChecks = false;
737 // Holds the end values for each induction variable. We save the end values
738 // so we can later fix-up the external users of the induction variables.
739 DenseMap<PHINode *, Value *> IVEndValues;
741 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
742 // fixed up at the end of vector code generation.
743 SmallVector<PHINode *, 8> OrigPHIsToFix;
746 class InnerLoopUnroller : public InnerLoopVectorizer {
747 public:
748 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
749 LoopInfo *LI, DominatorTree *DT,
750 const TargetLibraryInfo *TLI,
751 const TargetTransformInfo *TTI, AssumptionCache *AC,
752 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
753 LoopVectorizationLegality *LVL,
754 LoopVectorizationCostModel *CM)
755 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
756 UnrollFactor, LVL, CM) {}
758 private:
759 Value *getBroadcastInstrs(Value *V) override;
760 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
761 Instruction::BinaryOps Opcode =
762 Instruction::BinaryOpsEnd) override;
763 Value *reverseVector(Value *Vec) override;
766 } // end namespace llvm
768 /// Look for a meaningful debug location on the instruction or its
769 /// operands.
770 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
771 if (!I)
772 return I;
774 DebugLoc Empty;
775 if (I->getDebugLoc() != Empty)
776 return I;
778 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
779 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
780 if (OpInst->getDebugLoc() != Empty)
781 return OpInst;
784 return I;
787 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
788 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
789 const DILocation *DIL = Inst->getDebugLoc();
790 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
791 !isa<DbgInfoIntrinsic>(Inst)) {
792 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
793 if (NewDIL)
794 B.SetCurrentDebugLocation(NewDIL.getValue());
795 else
796 LLVM_DEBUG(dbgs()
797 << "Failed to create new discriminator: "
798 << DIL->getFilename() << " Line: " << DIL->getLine());
800 else
801 B.SetCurrentDebugLocation(DIL);
802 } else
803 B.SetCurrentDebugLocation(DebugLoc());
806 /// Write a record \p DebugMsg about vectorization failure to the debug
807 /// output stream. If \p I is passed, it is an instruction that prevents
808 /// vectorization.
809 #ifndef NDEBUG
810 static void debugVectorizationFailure(const StringRef DebugMsg,
811 Instruction *I) {
812 dbgs() << "LV: Not vectorizing: " << DebugMsg;
813 if (I != nullptr)
814 dbgs() << " " << *I;
815 else
816 dbgs() << '.';
817 dbgs() << '\n';
819 #endif
821 /// Create an analysis remark that explains why vectorization failed
823 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
824 /// RemarkName is the identifier for the remark. If \p I is passed it is an
825 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
826 /// the location of the remark. \return the remark object that can be
827 /// streamed to.
828 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
829 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
830 Value *CodeRegion = TheLoop->getHeader();
831 DebugLoc DL = TheLoop->getStartLoc();
833 if (I) {
834 CodeRegion = I->getParent();
835 // If there is no debug location attached to the instruction, revert back to
836 // using the loop's.
837 if (I->getDebugLoc())
838 DL = I->getDebugLoc();
841 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
842 R << "loop not vectorized: ";
843 return R;
846 namespace llvm {
848 void reportVectorizationFailure(const StringRef DebugMsg,
849 const StringRef OREMsg, const StringRef ORETag,
850 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
851 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
852 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
853 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
854 ORETag, TheLoop, I) << OREMsg);
857 } // end namespace llvm
859 #ifndef NDEBUG
860 /// \return string containing a file name and a line # for the given loop.
861 static std::string getDebugLocString(const Loop *L) {
862 std::string Result;
863 if (L) {
864 raw_string_ostream OS(Result);
865 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
866 LoopDbgLoc.print(OS);
867 else
868 // Just print the module name.
869 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
870 OS.flush();
872 return Result;
874 #endif
876 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
877 const Instruction *Orig) {
878 // If the loop was versioned with memchecks, add the corresponding no-alias
879 // metadata.
880 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
881 LVer->annotateInstWithNoAlias(To, Orig);
884 void InnerLoopVectorizer::addMetadata(Instruction *To,
885 Instruction *From) {
886 propagateMetadata(To, From);
887 addNewMetadata(To, From);
890 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
891 Instruction *From) {
892 for (Value *V : To) {
893 if (Instruction *I = dyn_cast<Instruction>(V))
894 addMetadata(I, From);
898 namespace llvm {
900 // Loop vectorization cost-model hints how the scalar epilogue loop should be
901 // lowered.
902 enum ScalarEpilogueLowering {
904 // The default: allowing scalar epilogues.
905 CM_ScalarEpilogueAllowed,
907 // Vectorization with OptForSize: don't allow epilogues.
908 CM_ScalarEpilogueNotAllowedOptSize,
910 // A special case of vectorization with OptForSize: loops with a very small
911 // trip count are considered for vectorization under OptForSize, thereby
912 // making sure the cost of their loop body is dominant, free of runtime
913 // guards and scalar iteration overheads.
914 CM_ScalarEpilogueNotAllowedLowTripLoop,
916 // Loop hint predicate indicating an epilogue is undesired.
917 CM_ScalarEpilogueNotNeededUsePredicate
920 /// LoopVectorizationCostModel - estimates the expected speedups due to
921 /// vectorization.
922 /// In many cases vectorization is not profitable. This can happen because of
923 /// a number of reasons. In this class we mainly attempt to predict the
924 /// expected speedup/slowdowns due to the supported instruction set. We use the
925 /// TargetTransformInfo to query the different backends for the cost of
926 /// different operations.
927 class LoopVectorizationCostModel {
928 public:
929 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
930 PredicatedScalarEvolution &PSE, LoopInfo *LI,
931 LoopVectorizationLegality *Legal,
932 const TargetTransformInfo &TTI,
933 const TargetLibraryInfo *TLI, DemandedBits *DB,
934 AssumptionCache *AC,
935 OptimizationRemarkEmitter *ORE, const Function *F,
936 const LoopVectorizeHints *Hints,
937 InterleavedAccessInfo &IAI)
938 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
939 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
940 Hints(Hints), InterleaveInfo(IAI) {}
942 /// \return An upper bound for the vectorization factor, or None if
943 /// vectorization and interleaving should be avoided up front.
944 Optional<unsigned> computeMaxVF();
946 /// \return True if runtime checks are required for vectorization, and false
947 /// otherwise.
948 bool runtimeChecksRequired();
950 /// \return The most profitable vectorization factor and the cost of that VF.
951 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
952 /// then this vectorization factor will be selected if vectorization is
953 /// possible.
954 VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
956 /// Setup cost-based decisions for user vectorization factor.
957 void selectUserVectorizationFactor(unsigned UserVF) {
958 collectUniformsAndScalars(UserVF);
959 collectInstsToScalarize(UserVF);
962 /// \return The size (in bits) of the smallest and widest types in the code
963 /// that needs to be vectorized. We ignore values that remain scalar such as
964 /// 64 bit loop indices.
965 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
967 /// \return The desired interleave count.
968 /// If interleave count has been specified by metadata it will be returned.
969 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
970 /// are the selected vectorization factor and the cost of the selected VF.
971 unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
973 /// A memory access instruction may be vectorized in more than one way. The
974 /// form of the instruction after vectorization depends on cost.
975 /// This function takes cost-based decisions for Load/Store instructions
976 /// and collects them in a map. This decision map is used for building
977 /// the lists of loop-uniform and loop-scalar instructions.
978 /// The calculated cost is saved with the widening decision in order to
979 /// avoid redundant calculations.
980 void setCostBasedWideningDecision(unsigned VF);
982 /// A struct that represents some properties of the register usage
983 /// of a loop.
984 struct RegisterUsage {
985 /// Holds the number of loop invariant values that are used in the loop.
986 unsigned LoopInvariantRegs;
988 /// Holds the maximum number of concurrent live intervals in the loop.
989 unsigned MaxLocalUsers;
992 /// \return Information about the register usage of the loop for the
993 /// given vectorization factors.
994 SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
996 /// Collect values we want to ignore in the cost model.
997 void collectValuesToIgnore();
999 /// \returns The smallest bitwidth each instruction can be represented with.
1000 /// The vector equivalents of these instructions should be truncated to this
1001 /// type.
1002 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1003 return MinBWs;
1006 /// \returns True if it is more profitable to scalarize instruction \p I for
1007 /// vectorization factor \p VF.
1008 bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1009 assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1011 // Cost model is not run in the VPlan-native path - return conservative
1012 // result until this changes.
1013 if (EnableVPlanNativePath)
1014 return false;
1016 auto Scalars = InstsToScalarize.find(VF);
1017 assert(Scalars != InstsToScalarize.end() &&
1018 "VF not yet analyzed for scalarization profitability");
1019 return Scalars->second.find(I) != Scalars->second.end();
1022 /// Returns true if \p I is known to be uniform after vectorization.
1023 bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1024 if (VF == 1)
1025 return true;
1027 // Cost model is not run in the VPlan-native path - return conservative
1028 // result until this changes.
1029 if (EnableVPlanNativePath)
1030 return false;
1032 auto UniformsPerVF = Uniforms.find(VF);
1033 assert(UniformsPerVF != Uniforms.end() &&
1034 "VF not yet analyzed for uniformity");
1035 return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1038 /// Returns true if \p I is known to be scalar after vectorization.
1039 bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1040 if (VF == 1)
1041 return true;
1043 // Cost model is not run in the VPlan-native path - return conservative
1044 // result until this changes.
1045 if (EnableVPlanNativePath)
1046 return false;
1048 auto ScalarsPerVF = Scalars.find(VF);
1049 assert(ScalarsPerVF != Scalars.end() &&
1050 "Scalar values are not calculated for VF");
1051 return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1054 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1055 /// for vectorization factor \p VF.
1056 bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1057 return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1058 !isProfitableToScalarize(I, VF) &&
1059 !isScalarAfterVectorization(I, VF);
1062 /// Decision that was taken during cost calculation for memory instruction.
1063 enum InstWidening {
1064 CM_Unknown,
1065 CM_Widen, // For consecutive accesses with stride +1.
1066 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1067 CM_Interleave,
1068 CM_GatherScatter,
1069 CM_Scalarize
1072 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073 /// instruction \p I and vector width \p VF.
1074 void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1075 unsigned Cost) {
1076 assert(VF >= 2 && "Expected VF >=2");
1077 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1080 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1081 /// interleaving group \p Grp and vector width \p VF.
1082 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1083 InstWidening W, unsigned Cost) {
1084 assert(VF >= 2 && "Expected VF >=2");
1085 /// Broadcast this decision to all instructions inside the group.
1086 /// But the cost will be assigned to one instruction only.
1087 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1088 if (auto *I = Grp->getMember(i)) {
1089 if (Grp->getInsertPos() == I)
1090 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1091 else
1092 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1097 /// Return the cost model decision for the given instruction \p I and vector
1098 /// width \p VF. Return CM_Unknown if this instruction did not pass
1099 /// through the cost modeling.
1100 InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1101 assert(VF >= 2 && "Expected VF >=2");
1103 // Cost model is not run in the VPlan-native path - return conservative
1104 // result until this changes.
1105 if (EnableVPlanNativePath)
1106 return CM_GatherScatter;
1108 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1109 auto Itr = WideningDecisions.find(InstOnVF);
1110 if (Itr == WideningDecisions.end())
1111 return CM_Unknown;
1112 return Itr->second.first;
1115 /// Return the vectorization cost for the given instruction \p I and vector
1116 /// width \p VF.
1117 unsigned getWideningCost(Instruction *I, unsigned VF) {
1118 assert(VF >= 2 && "Expected VF >=2");
1119 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1120 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1121 "The cost is not calculated");
1122 return WideningDecisions[InstOnVF].second;
1125 /// Return True if instruction \p I is an optimizable truncate whose operand
1126 /// is an induction variable. Such a truncate will be removed by adding a new
1127 /// induction variable with the destination type.
1128 bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1129 // If the instruction is not a truncate, return false.
1130 auto *Trunc = dyn_cast<TruncInst>(I);
1131 if (!Trunc)
1132 return false;
1134 // Get the source and destination types of the truncate.
1135 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1136 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1138 // If the truncate is free for the given types, return false. Replacing a
1139 // free truncate with an induction variable would add an induction variable
1140 // update instruction to each iteration of the loop. We exclude from this
1141 // check the primary induction variable since it will need an update
1142 // instruction regardless.
1143 Value *Op = Trunc->getOperand(0);
1144 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145 return false;
1147 // If the truncated value is not an induction variable, return false.
1148 return Legal->isInductionPhi(Op);
1151 /// Collects the instructions to scalarize for each predicated instruction in
1152 /// the loop.
1153 void collectInstsToScalarize(unsigned VF);
1155 /// Collect Uniform and Scalar values for the given \p VF.
1156 /// The sets depend on CM decision for Load/Store instructions
1157 /// that may be vectorized as interleave, gather-scatter or scalarized.
1158 void collectUniformsAndScalars(unsigned VF) {
1159 // Do the analysis once.
1160 if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1161 return;
1162 setCostBasedWideningDecision(VF);
1163 collectLoopUniforms(VF);
1164 collectLoopScalars(VF);
1167 /// Returns true if the target machine supports masked store operation
1168 /// for the given \p DataType and kind of access to \p Ptr.
1169 bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1170 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1173 /// Returns true if the target machine supports masked load operation
1174 /// for the given \p DataType and kind of access to \p Ptr.
1175 bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1176 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1179 /// Returns true if the target machine supports masked scatter operation
1180 /// for the given \p DataType.
1181 bool isLegalMaskedScatter(Type *DataType) {
1182 return TTI.isLegalMaskedScatter(DataType);
1185 /// Returns true if the target machine supports masked gather operation
1186 /// for the given \p DataType.
1187 bool isLegalMaskedGather(Type *DataType) {
1188 return TTI.isLegalMaskedGather(DataType);
1191 /// Returns true if the target machine can represent \p V as a masked gather
1192 /// or scatter operation.
1193 bool isLegalGatherOrScatter(Value *V) {
1194 bool LI = isa<LoadInst>(V);
1195 bool SI = isa<StoreInst>(V);
1196 if (!LI && !SI)
1197 return false;
1198 auto *Ty = getMemInstValueType(V);
1199 return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1202 /// Returns true if \p I is an instruction that will be scalarized with
1203 /// predication. Such instructions include conditional stores and
1204 /// instructions that may divide by zero.
1205 /// If a non-zero VF has been calculated, we check if I will be scalarized
1206 /// with predication for that VF.
1207 bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1209 // Returns true if \p I is an instruction that will be predicated either
1210 // through scalar predication or masked load/store or masked gather/scatter.
1211 // Superset of instructions that return true for isScalarWithPredication.
1212 bool isPredicatedInst(Instruction *I) {
1213 if (!blockNeedsPredication(I->getParent()))
1214 return false;
1215 // Loads and stores that need some form of masked operation are predicated
1216 // instructions.
1217 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1218 return Legal->isMaskRequired(I);
1219 return isScalarWithPredication(I);
1222 /// Returns true if \p I is a memory instruction with consecutive memory
1223 /// access that can be widened.
1224 bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1226 /// Returns true if \p I is a memory instruction in an interleaved-group
1227 /// of memory accesses that can be vectorized with wide vector loads/stores
1228 /// and shuffles.
1229 bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1231 /// Check if \p Instr belongs to any interleaved access group.
1232 bool isAccessInterleaved(Instruction *Instr) {
1233 return InterleaveInfo.isInterleaved(Instr);
1236 /// Get the interleaved access group that \p Instr belongs to.
1237 const InterleaveGroup<Instruction> *
1238 getInterleavedAccessGroup(Instruction *Instr) {
1239 return InterleaveInfo.getInterleaveGroup(Instr);
1242 /// Returns true if an interleaved group requires a scalar iteration
1243 /// to handle accesses with gaps, and there is nothing preventing us from
1244 /// creating a scalar epilogue.
1245 bool requiresScalarEpilogue() const {
1246 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1249 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1250 /// loop hint annotation.
1251 bool isScalarEpilogueAllowed() const {
1252 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1255 /// Returns true if all loop blocks should be masked to fold the loop tail.
1256 bool foldTailByMasking() const { return FoldTailByMasking; }
1258 bool blockNeedsPredication(BasicBlock *BB) {
1259 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1262 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1263 /// with factor VF. Return the cost of the instruction, including
1264 /// scalarization overhead if it's needed.
1265 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1267 /// Estimate cost of a call instruction CI if it were vectorized with factor
1268 /// VF. Return the cost of the instruction, including scalarization overhead
1269 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1270 /// scalarized -
1271 /// i.e. either a vector version isn't available, or it is too expensive.
1272 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1274 private:
1275 unsigned NumPredStores = 0;
1277 /// \return An upper bound for the vectorization factor, larger than zero.
1278 /// One is returned if vectorization should best be avoided due to cost.
1279 unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1281 /// The vectorization cost is a combination of the cost itself and a boolean
1282 /// indicating whether any of the contributing operations will actually
1283 /// operate on vector values after type legalization in the backend. If this
1284 /// latter value is false, then all operations will be scalarized (i.e. no
1285 /// vectorization has actually taken place).
1288 using VectorizationCostTy = std::pair<unsigned, bool>;
1290 /// Returns the expected execution cost. The unit of the cost does
1291 /// not matter because we use the 'cost' units to compare different
1292 /// vector widths. The cost that is returned is *not* normalized by
1293 /// the factor width.
1294 VectorizationCostTy expectedCost(unsigned VF);
1296 /// Returns the execution time cost of an instruction for a given vector
1297 /// width. Vector width of one means scalar.
1298 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1300 /// The cost-computation logic from getInstructionCost which provides
1301 /// the vector type as an output parameter.
1302 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1304 /// Calculate vectorization cost of memory instruction \p I.
1305 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1307 /// The cost computation for scalarized memory instruction.
1308 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1310 /// The cost computation for an interleave group of memory instructions.
1311 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1313 /// The cost computation for Gather/Scatter instruction.
1314 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1316 /// The cost computation for widening instruction \p I with consecutive
1317 /// memory access.
1318 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1320 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1321 /// Load: scalar load + broadcast.
1322 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1323 /// element)
1324 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1326 /// Estimate the overhead of scalarizing an instruction. This is a
1327 /// convenience wrapper for the type-based getScalarizationOverhead API.
1328 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1330 /// Returns whether the instruction is a load or store and will be emitted
1331 /// as a vector operation.
1332 bool isConsecutiveLoadOrStore(Instruction *I);
1334 /// Returns true if an artificially high cost for emulated masked memrefs
1335 /// should be used.
1336 bool useEmulatedMaskMemRefHack(Instruction *I);
1338 /// Map of scalar integer values to the smallest bitwidth they can be legally
1339 /// represented as. The vector equivalents of these values should be truncated
1340 /// to this type.
1341 MapVector<Instruction *, uint64_t> MinBWs;
1343 /// A type representing the costs for instructions if they were to be
1344 /// scalarized rather than vectorized. The entries are Instruction-Cost
1345 /// pairs.
1346 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1348 /// A set containing all BasicBlocks that are known to be present after
1349 /// vectorization as predicated blocks.
1350 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1352 /// Records whether it is allowed to have the original scalar loop execute at
1353 /// least once. This may be needed as a fallback loop in case runtime
1354 /// aliasing/dependence checks fail, or to handle the tail/remainder
1355 /// iterations when the trip count is unknown or doesn't divide by the VF,
1356 /// or as a peel-loop to handle gaps in interleave-groups.
1357 /// Under optsize and when the trip count is very small we don't allow any
1358 /// iterations to execute in the scalar loop.
1359 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1361 /// All blocks of the loop are to be masked to fold the tail of the scalar iterations.
1362 bool FoldTailByMasking = false;
1364 /// A map holding scalar costs for different vectorization factors. The
1365 /// presence of a cost for an instruction in the mapping indicates that the
1366 /// instruction will be scalarized when vectorizing with the associated
1367 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1370 /// Holds the instructions known to be uniform after vectorization.
1371 /// The data is collected per VF.
1372 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1374 /// Holds the instructions known to be scalar after vectorization.
1375 /// The data is collected per VF.
1376 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1378 /// Holds the instructions (address computations) that are forced to be
1379 /// scalarized.
1380 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1382 /// Returns the expected difference in cost from scalarizing the expression
1383 /// feeding a predicated instruction \p PredInst. The instructions to
1384 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385 /// non-negative return value implies the expression will be scalarized.
1386 /// Currently, only single-use chains are considered for scalarization.
1387 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388 unsigned VF);
1390 /// Collect the instructions that are uniform after vectorization. An
1391 /// instruction is uniform if we represent it with a single scalar value in
1392 /// the vectorized loop corresponding to each vector iteration. Examples of
1393 /// uniform instructions include pointer operands of consecutive or
1394 /// interleaved memory accesses. Note that although uniformity implies an
1395 /// instruction will be scalar, the reverse is not true. In general, a
1396 /// scalarized instruction will be represented by VF scalar values in the
1397 /// vectorized loop, each corresponding to an iteration of the original
1398 /// scalar loop.
1399 void collectLoopUniforms(unsigned VF);
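// For illustration, assuming VF = 4: in a loop computing "a[i] = b[i] + 1",
// the address computations feeding the consecutive loads/stores are uniform
// after vectorization - a single scalar address per unroll part suffices -
// whereas a non-uniform scalarized instruction would be replicated into 4
// scalar copies, one per vector lane.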
1401 /// Collect the instructions that are scalar after vectorization. An
1402 /// instruction is scalar if it is known to be uniform or will be scalarized
1403 /// during vectorization. Non-uniform scalarized instructions will be
1404 /// represented by VF values in the vectorized loop, each corresponding to an
1405 /// iteration of the original scalar loop.
1406 void collectLoopScalars(unsigned VF);
1408 /// Keeps cost model vectorization decision and cost for instructions.
1409 /// Right now it is used for memory instructions only.
1410 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1411 std::pair<InstWidening, unsigned>>;
1413 DecisionList WideningDecisions;
1415 /// Returns true if \p V is expected to be vectorized and it needs to be
1416 /// extracted.
1417 bool needsExtract(Value *V, unsigned VF) const {
1418 Instruction *I = dyn_cast<Instruction>(V);
1419 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1420 return false;
1422 // Assume we can vectorize V (and hence we need extraction) if the
1423 // scalars are not computed yet. This can happen, because it is called
1424 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1425 // the scalars are collected. That should be a safe assumption in most
1426 // cases, because we check if the operands have vectorizable types
1427 // beforehand in LoopVectorizationLegality.
1428 return Scalars.find(VF) == Scalars.end() ||
1429 !isScalarAfterVectorization(I, VF);
1432 /// Returns a range containing only operands needing to be extracted.
1433 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1434 unsigned VF) {
1435 return SmallVector<Value *, 4>(make_filter_range(
1436 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1439 public:
1440 /// The loop that we evaluate.
1441 Loop *TheLoop;
1443 /// Predicated scalar evolution analysis.
1444 PredicatedScalarEvolution &PSE;
1446 /// Loop Info analysis.
1447 LoopInfo *LI;
1449 /// Vectorization legality.
1450 LoopVectorizationLegality *Legal;
1452 /// Vector target information.
1453 const TargetTransformInfo &TTI;
1455 /// Target Library Info.
1456 const TargetLibraryInfo *TLI;
1458 /// Demanded bits analysis.
1459 DemandedBits *DB;
1461 /// Assumption cache.
1462 AssumptionCache *AC;
1464 /// Interface to emit optimization remarks.
1465 OptimizationRemarkEmitter *ORE;
1467 const Function *TheFunction;
1469 /// Loop Vectorize Hint.
1470 const LoopVectorizeHints *Hints;
1472 /// The interleave access information contains groups of interleaved accesses
1473 /// with the same stride that are close to each other.
1474 InterleavedAccessInfo &InterleaveInfo;
1476 /// Values to ignore in the cost model.
1477 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1479 /// Values to ignore in the cost model when VF > 1.
1480 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1483 } // end namespace llvm
1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1486 // vectorization. The loop needs to be annotated with #pragma omp simd
1487 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1488 // vector length information is not provided, vectorization is not considered
1489 // explicit. Interleave hints are not allowed either. These limitations will be
1490 // relaxed in the future.
1491 // Please note that we are currently forced to abuse the pragma 'clang
1492 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1494 // provides *explicit vectorization hints* (LV can bypass legal checks and
1495 // assume that vectorization is legal). However, both hints are implemented
1496 // using the same metadata (llvm.loop.vectorize, processed by
1497 // LoopVectorizeHints). This will be fixed in the future when the native IR
1498 // representation for pragma 'omp simd' is introduced.
1499 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1500 OptimizationRemarkEmitter *ORE) {
1501 assert(!OuterLp->empty() && "This is not an outer loop");
1502 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1504 // Only outer loops with an explicit vectorization hint are supported.
1505 // Unannotated outer loops are ignored.
1506 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1507 return false;
1509 Function *Fn = OuterLp->getHeader()->getParent();
1510 if (!Hints.allowVectorization(Fn, OuterLp,
1511 true /*VectorizeOnlyWhenForced*/)) {
1512 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1513 return false;
1516 if (Hints.getInterleave() > 1) {
1517 // TODO: Interleave support is future work.
1518 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1519 "outer loops.\n");
1520 Hints.emitRemarkWithHints();
1521 return false;
1524 return true;
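// For illustration, an outer loop that isExplicitVecOuterLoop would accept
// (assuming the VPlan-native path is enabled) can be written in source as:
//
//   #pragma omp simd simdlen(4)
//   for (i = 0; i < N; ++i)      // explicitly annotated outer loop, VF = 4
//     for (j = 0; j < M; ++j)    // inner loop
//       A[i][j] = B[i][j] + C[i];
//
// An unannotated outer loop, or one annotated without a vector length, is
// rejected above.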
1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1528 OptimizationRemarkEmitter *ORE,
1529 SmallVectorImpl<Loop *> &V) {
1530 // Collect inner loops and outer loops without irreducible control flow. For
1531 // now, only collect outer loops that have explicit vectorization hints. If we
1532 // are stress testing the VPlan H-CFG construction, we collect the outermost
1533 // loop of every loop nest.
1534 if (L.empty() || VPlanBuildStressTest ||
1535 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1536 LoopBlocksRPO RPOT(&L);
1537 RPOT.perform(LI);
1538 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1539 V.push_back(&L);
1540 // TODO: Collect inner loops inside marked outer loops in case
1541 // vectorization fails for the outer loop. Do not invoke
1542 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1543 // already known to be reducible. We can use an inherited attribute for
1544 // that.
1545 return;
1548 for (Loop *InnerL : L)
1549 collectSupportedLoops(*InnerL, LI, ORE, V);
1552 namespace {
1554 /// The LoopVectorize Pass.
1555 struct LoopVectorize : public FunctionPass {
1556 /// Pass identification, replacement for typeid
1557 static char ID;
1559 LoopVectorizePass Impl;
1561 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1562 bool VectorizeOnlyWhenForced = false)
1563 : FunctionPass(ID) {
1564 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1565 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1566 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1569 bool runOnFunction(Function &F) override {
1570 if (skipFunction(F))
1571 return false;
1573 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1574 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1575 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1576 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1577 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1578 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1579 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1580 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1581 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1582 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1583 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1584 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1585 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1587 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1588 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1590 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1591 GetLAA, *ORE, PSI);
1594 void getAnalysisUsage(AnalysisUsage &AU) const override {
1595 AU.addRequired<AssumptionCacheTracker>();
1596 AU.addRequired<BlockFrequencyInfoWrapperPass>();
1597 AU.addRequired<DominatorTreeWrapperPass>();
1598 AU.addRequired<LoopInfoWrapperPass>();
1599 AU.addRequired<ScalarEvolutionWrapperPass>();
1600 AU.addRequired<TargetTransformInfoWrapperPass>();
1601 AU.addRequired<AAResultsWrapperPass>();
1602 AU.addRequired<LoopAccessLegacyAnalysis>();
1603 AU.addRequired<DemandedBitsWrapperPass>();
1604 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1606 // We currently do not preserve loopinfo/dominator analyses with outer loop
1607 // vectorization. Until this is addressed, mark these analyses as preserved
1608 // only for non-VPlan-native path.
1609 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1610 if (!EnableVPlanNativePath) {
1611 AU.addPreserved<LoopInfoWrapperPass>();
1612 AU.addPreserved<DominatorTreeWrapperPass>();
1615 AU.addPreserved<BasicAAWrapperPass>();
1616 AU.addPreserved<GlobalsAAWrapperPass>();
1617 AU.addRequired<ProfileSummaryInfoWrapperPass>();
1621 } // end anonymous namespace
1623 //===----------------------------------------------------------------------===//
1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
1625 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1626 //===----------------------------------------------------------------------===//
1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1629 // We need to place the broadcast of invariant variables outside the loop,
1630 // but only if it's proven safe to do so. Otherwise, the broadcast will be
1631 // placed inside the vector loop body.
1632 Instruction *Instr = dyn_cast<Instruction>(V);
1633 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1634 (!Instr ||
1635 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1636 // Place the code for broadcasting invariant variables in the new preheader.
1637 IRBuilder<>::InsertPointGuard Guard(Builder);
1638 if (SafeToHoist)
1639 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1641 // Broadcast the scalar into all locations in the vector.
1642 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1644 return Shuf;
1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1648 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1649 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1650 "Expected either an induction phi-node or a truncate of it!");
1651 Value *Start = II.getStartValue();
1653 // Construct the initial value of the vector IV in the vector loop preheader
1654 auto CurrIP = Builder.saveIP();
1655 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1656 if (isa<TruncInst>(EntryVal)) {
1657 assert(Start->getType()->isIntegerTy() &&
1658 "Truncation requires an integer type");
1659 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1660 Step = Builder.CreateTrunc(Step, TruncType);
1661 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1663 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1664 Value *SteppedStart =
1665 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1667 // We create vector phi nodes for both integer and floating-point induction
1668 // variables. Here, we determine the kind of arithmetic we will perform.
1669 Instruction::BinaryOps AddOp;
1670 Instruction::BinaryOps MulOp;
1671 if (Step->getType()->isIntegerTy()) {
1672 AddOp = Instruction::Add;
1673 MulOp = Instruction::Mul;
1674 } else {
1675 AddOp = II.getInductionOpcode();
1676 MulOp = Instruction::FMul;
1679 // Multiply the vectorization factor by the step using integer or
1680 // floating-point arithmetic as appropriate.
1681 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1682 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1684 // Create a vector splat to use in the induction update.
1686 // FIXME: If the step is non-constant, we create the vector splat with
1687 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1688 // handle a constant vector splat.
1689 Value *SplatVF = isa<Constant>(Mul)
1690 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1691 : Builder.CreateVectorSplat(VF, Mul);
1692 Builder.restoreIP(CurrIP);
1694 // We may need to add the step a number of times, depending on the unroll
1695 // factor. The last of those goes into the PHI.
1696 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1697 &*LoopVectorBody->getFirstInsertionPt());
1698 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1699 Instruction *LastInduction = VecInd;
1700 for (unsigned Part = 0; Part < UF; ++Part) {
1701 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1703 if (isa<TruncInst>(EntryVal))
1704 addMetadata(LastInduction, EntryVal);
1705 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1707 LastInduction = cast<Instruction>(addFastMathFlag(
1708 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1709 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1712 // Move the last step to the end of the latch block. This ensures consistent
1713 // placement of all induction updates.
1714 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1715 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1716 auto *ICmp = cast<Instruction>(Br->getCondition());
1717 LastInduction->moveBefore(ICmp);
1718 LastInduction->setName("vec.ind.next");
1720 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1721 VecInd->addIncoming(LastInduction, LoopVectorLatch);
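// For illustration, assuming VF = 4, UF = 2 and an integer IV starting at 0
// with step 1: SteppedStart is <0, 1, 2, 3> and SplatVF is <4, 4, 4, 4>, so
// part 0 uses the phi %vec.ind = <0, 1, 2, 3>, part 1 uses
// %step.add = <4, 5, 6, 7>, and %vec.ind.next = <8, 9, 10, 11> is the value
// fed back into the phi for the next vector iteration.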
1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1725 return Cost->isScalarAfterVectorization(I, VF) ||
1726 Cost->isProfitableToScalarize(I, VF);
1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1730 if (shouldScalarizeInstruction(IV))
1731 return true;
1732 auto isScalarInst = [&](User *U) -> bool {
1733 auto *I = cast<Instruction>(U);
1734 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1736 return llvm::any_of(IV->users(), isScalarInst);
1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1740 const InductionDescriptor &ID, const Instruction *EntryVal,
1741 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1742 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1743 "Expected either an induction phi-node or a truncate of it!");
1745 // This induction variable is not the phi from the original loop but the
1746 // newly-created IV, based on the proof that the cast Phi is equal to the
1747 // uncast Phi in the vectorized loop (possibly under a runtime guard). It
1748 // re-uses the same InductionDescriptor as the original IV, but we don't
1749 // have to do any recording in this case - that is done when the original
1750 // IV is processed.
1751 if (isa<TruncInst>(EntryVal))
1752 return;
1754 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1755 if (Casts.empty())
1756 return;
1757 // Only the first Cast instruction in the Casts vector is of interest.
1758 // The rest of the Casts (if they exist) have no uses outside the
1759 // induction update chain itself.
1760 Instruction *CastInst = *Casts.begin();
1761 if (Lane < UINT_MAX)
1762 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1763 else
1764 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1768 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1769 "Primary induction variable must have an integer type");
1771 auto II = Legal->getInductionVars()->find(IV);
1772 assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1774 auto ID = II->second;
1775 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1777 // The scalar value to broadcast. This will be derived from the canonical
1778 // induction variable.
1779 Value *ScalarIV = nullptr;
1781 // The value from the original loop to which we are mapping the new induction
1782 // variable.
1783 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1785 // True if we have vectorized the induction variable.
1786 auto VectorizedIV = false;
1788 // Determine if we want a scalar version of the induction variable. This is
1789 // true if the induction variable itself is not widened, or if it has at
1790 // least one user in the loop that is not widened.
1791 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1793 // Generate code for the induction step. Note that induction steps are
1794 // required to be loop-invariant
1795 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1796 "Induction step should be loop invariant");
1797 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1798 Value *Step = nullptr;
1799 if (PSE.getSE()->isSCEVable(IV->getType())) {
1800 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1801 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1802 LoopVectorPreHeader->getTerminator());
1803 } else {
1804 Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1807 // Try to create a new independent vector induction variable. If we can't
1808 // create the phi node, we will splat the scalar induction variable in each
1809 // loop iteration.
1810 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1811 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1812 VectorizedIV = true;
1815 // If we haven't yet vectorized the induction variable, or if we will create
1816 // a scalar one, we need to define the scalar induction variable and step
1817 // values. If we were given a truncation type, truncate the canonical
1818 // induction variable and step. Otherwise, derive these values from the
1819 // induction descriptor.
1820 if (!VectorizedIV || NeedsScalarIV) {
1821 ScalarIV = Induction;
1822 if (IV != OldInduction) {
1823 ScalarIV = IV->getType()->isIntegerTy()
1824 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1825 : Builder.CreateCast(Instruction::SIToFP, Induction,
1826 IV->getType());
1827 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1828 ScalarIV->setName("offset.idx");
1830 if (Trunc) {
1831 auto *TruncType = cast<IntegerType>(Trunc->getType());
1832 assert(Step->getType()->isIntegerTy() &&
1833 "Truncation requires an integer step");
1834 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1835 Step = Builder.CreateTrunc(Step, TruncType);
1839 // If we haven't yet vectorized the induction variable, splat the scalar
1840 // induction variable, and build the necessary step vectors.
1841 // TODO: Don't do it unless the vectorized IV is really required.
1842 if (!VectorizedIV) {
1843 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1844 for (unsigned Part = 0; Part < UF; ++Part) {
1845 Value *EntryPart =
1846 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1847 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1848 if (Trunc)
1849 addMetadata(EntryPart, Trunc);
1850 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1854 // If an induction variable is only used for counting loop iterations or
1855 // calculating addresses, it doesn't need to be widened. Create scalar steps
1856 // that can be used by instructions we will later scalarize. Note that the
1857 // addition of the scalar steps will not increase the number of instructions
1858 // in the loop in the common case prior to InstCombine. We will be trading
1859 // one vector extract for each scalar step.
1860 if (NeedsScalarIV)
1861 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1865 Instruction::BinaryOps BinOp) {
1866 // Create and check the types.
1867 assert(Val->getType()->isVectorTy() && "Must be a vector");
1868 int VLen = Val->getType()->getVectorNumElements();
1870 Type *STy = Val->getType()->getScalarType();
1871 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1872 "Induction Step must be an integer or FP");
1873 assert(Step->getType() == STy && "Step has wrong type");
1875 SmallVector<Constant *, 8> Indices;
1877 if (STy->isIntegerTy()) {
1878 // Create a vector of consecutive numbers from StartIdx to StartIdx + VLen - 1.
1879 for (int i = 0; i < VLen; ++i)
1880 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1882 // Add the consecutive indices to the vector value.
1883 Constant *Cv = ConstantVector::get(Indices);
1884 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1885 Step = Builder.CreateVectorSplat(VLen, Step);
1886 assert(Step->getType() == Val->getType() && "Invalid step vec");
1887 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1888 // which can be found from the original scalar operations.
1889 Step = Builder.CreateMul(Cv, Step);
1890 return Builder.CreateAdd(Val, Step, "induction");
1893 // Floating point induction.
1894 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1895 "Binary Opcode should be specified for FP induction");
1896 // Create a vector of consecutive numbers from StartIdx to StartIdx + VLen - 1.
1897 for (int i = 0; i < VLen; ++i)
1898 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1900 // Add the consecutive indices to the vector value.
1901 Constant *Cv = ConstantVector::get(Indices);
1903 Step = Builder.CreateVectorSplat(VLen, Step);
1905 // Floating point operations had to be 'fast' to enable the induction.
1906 FastMathFlags Flags;
1907 Flags.setFast();
1909 Value *MulOp = Builder.CreateFMul(Cv, Step);
1910 if (isa<Instruction>(MulOp))
1911 // Have to check, MulOp may be a constant
1912 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1914 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1915 if (isa<Instruction>(BOp))
1916 cast<Instruction>(BOp)->setFastMathFlags(Flags);
1917 return BOp;
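// For illustration, assuming VLen = 4, an integer Val of <x, x, x, x>,
// StartIdx = 4 and Step = 2: Cv is <4, 5, 6, 7>, the splatted step is
// <2, 2, 2, 2>, their product is <8, 10, 12, 14>, and the returned
// "induction" value is <x+8, x+10, x+12, x+14>.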
1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1921 Instruction *EntryVal,
1922 const InductionDescriptor &ID) {
1923 // We shouldn't have to build scalar steps if we aren't vectorizing.
1924 assert(VF > 1 && "VF should be greater than one");
1926 // Get the value type and ensure it and the step have the same type.
1927 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1928 assert(ScalarIVTy == Step->getType() &&
1929 "Val and Step should have the same type");
1931 // We build scalar steps for both integer and floating-point induction
1932 // variables. Here, we determine the kind of arithmetic we will perform.
1933 Instruction::BinaryOps AddOp;
1934 Instruction::BinaryOps MulOp;
1935 if (ScalarIVTy->isIntegerTy()) {
1936 AddOp = Instruction::Add;
1937 MulOp = Instruction::Mul;
1938 } else {
1939 AddOp = ID.getInductionOpcode();
1940 MulOp = Instruction::FMul;
1943 // Determine the number of scalars we need to generate for each unroll
1944 // iteration. If EntryVal is uniform, we only need to generate the first
1945 // lane. Otherwise, we generate all VF values.
1946 unsigned Lanes =
1947 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1948 : VF;
1949 // Compute the scalar steps and save the results in VectorLoopValueMap.
1950 for (unsigned Part = 0; Part < UF; ++Part) {
1951 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1952 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1953 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1954 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1955 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1956 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
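// For illustration, assuming VF = 4, UF = 2, ScalarIV = %i and Step = 1 with
// a non-uniform EntryVal: part 0 produces the scalars %i+0 .. %i+3 and part 1
// produces %i+4 .. %i+7. If EntryVal is uniform after vectorization, only
// lane 0 of each part (%i+0 and %i+4) is generated.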
1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1962 assert(V != Induction && "The new induction variable should not be used.");
1963 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1964 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1966 // If we have a stride that is replaced by one, do it here. Defer this for
1967 // the VPlan-native path until we start running Legal checks in that path.
1968 if (!EnableVPlanNativePath && Legal->hasStride(V))
1969 V = ConstantInt::get(V->getType(), 1);
1971 // If we have a vector mapped to this value, return it.
1972 if (VectorLoopValueMap.hasVectorValue(V, Part))
1973 return VectorLoopValueMap.getVectorValue(V, Part);
1975 // If the value has not been vectorized, check if it has been scalarized
1976 // instead. If it has been scalarized, and we actually need the value in
1977 // vector form, we will construct the vector values on demand.
1978 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1979 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1981 // If we've scalarized a value, that value should be an instruction.
1982 auto *I = cast<Instruction>(V);
1984 // If we aren't vectorizing, we can just copy the scalar map values over to
1985 // the vector map.
1986 if (VF == 1) {
1987 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1988 return ScalarValue;
1991 // Get the last scalar instruction we generated for V and Part. If the value
1992 // is known to be uniform after vectorization, this corresponds to lane zero
1993 // of the Part unroll iteration. Otherwise, the last instruction is the one
1994 // we created for the last vector lane of the Part unroll iteration.
1995 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1996 auto *LastInst = cast<Instruction>(
1997 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1999 // Set the insert point after the last scalarized instruction. This ensures
2000 // the insertelement sequence will directly follow the scalar definitions.
2001 auto OldIP = Builder.saveIP();
2002 auto NewIP = std::next(BasicBlock::iterator(LastInst));
2003 Builder.SetInsertPoint(&*NewIP);
2005 // However, if we are vectorizing, we need to construct the vector values.
2006 // If the value is known to be uniform after vectorization, we can just
2007 // broadcast the scalar value corresponding to lane zero for each unroll
2008 // iteration. Otherwise, we construct the vector values using insertelement
2009 // instructions. Since the resulting vectors are stored in
2010 // VectorLoopValueMap, we will only generate the insertelements once.
2011 Value *VectorValue = nullptr;
2012 if (Cost->isUniformAfterVectorization(I, VF)) {
2013 VectorValue = getBroadcastInstrs(ScalarValue);
2014 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2015 } else {
2016 // Initialize packing with insertelements to start from undef.
2017 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2018 VectorLoopValueMap.setVectorValue(V, Part, Undef);
2019 for (unsigned Lane = 0; Lane < VF; ++Lane)
2020 packScalarIntoVectorValue(V, {Part, Lane});
2021 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2023 Builder.restoreIP(OldIP);
2024 return VectorValue;
2027 // If this scalar is unknown, assume that it is a constant or that it is
2028 // loop invariant. Broadcast V and save the value for future uses.
2029 Value *B = getBroadcastInstrs(V);
2030 VectorLoopValueMap.setVectorValue(V, Part, B);
2031 return B;
2034 Value *
2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2036 const VPIteration &Instance) {
2037 // If the value is not an instruction contained in the loop, it should
2038 // already be scalar.
2039 if (OrigLoop->isLoopInvariant(V))
2040 return V;
2042 assert(Instance.Lane > 0
2043 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2044 : true && "Uniform values only have lane zero");
2046 // If the value from the original loop has not been vectorized, it is
2047 // represented by UF x VF scalar values in the new loop. Return the requested
2048 // scalar value.
2049 if (VectorLoopValueMap.hasScalarValue(V, Instance))
2050 return VectorLoopValueMap.getScalarValue(V, Instance);
2052 // If the value has not been scalarized, get its entry in VectorLoopValueMap
2053 // for the given unroll part. If this entry is not a vector type (i.e., the
2054 // vectorization factor is one), there is no need to generate an
2055 // extractelement instruction.
2056 auto *U = getOrCreateVectorValue(V, Instance.Part);
2057 if (!U->getType()->isVectorTy()) {
2058 assert(VF == 1 && "Value not scalarized has non-vector type");
2059 return U;
2062 // Otherwise, the value from the original loop has been vectorized and is
2063 // represented by UF vector values. Extract and return the requested scalar
2064 // value from the appropriate vector lane.
2065 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2068 void InnerLoopVectorizer::packScalarIntoVectorValue(
2069 Value *V, const VPIteration &Instance) {
2070 assert(V != Induction && "The new induction variable should not be used.");
2071 assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2072 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2074 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2075 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2076 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2077 Builder.getInt32(Instance.Lane));
2078 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2082 assert(Vec->getType()->isVectorTy() && "Invalid type");
2083 SmallVector<Constant *, 8> ShuffleMask;
2084 for (unsigned i = 0; i < VF; ++i)
2085 ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2087 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2088 ConstantVector::get(ShuffleMask),
2089 "reverse");
2092 // Return whether we allow using masked interleave-groups (for dealing with
2093 // strided loads/stores that reside in predicated blocks, or for dealing
2094 // with gaps).
2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2096 // If an override option has been passed in for interleaved accesses, use it.
2097 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2098 return EnableMaskedInterleavedMemAccesses;
2100 return TTI.enableMaskedInterleavedAccessVectorization();
2103 // Try to vectorize the interleave group that \p Instr belongs to.
2105 // E.g. Translate the following interleaved load group (factor = 3):
2106 // for (i = 0; i < N; i+=3) {
2107 // R = Pic[i]; // Member of index 0
2108 // G = Pic[i+1]; // Member of index 1
2109 // B = Pic[i+2]; // Member of index 2
2110 // ... // do something to R, G, B
2111 // }
2112 // To:
2113 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2114 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2115 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2116 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2118 // Or translate the following interleaved store group (factor = 3):
2119 // for (i = 0; i < N; i+=3) {
2120 // ... do something to R, G, B
2121 // Pic[i] = R; // Member of index 0
2122 // Pic[i+1] = G; // Member of index 1
2123 // Pic[i+2] = B; // Member of index 2
2124 // }
2125 // To:
2126 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2127 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2128 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2129 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2130 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2132 VectorParts *BlockInMask) {
2133 const InterleaveGroup<Instruction> *Group =
2134 Cost->getInterleavedAccessGroup(Instr);
2135 assert(Group && "Fail to get an interleaved access group.");
2137 // Skip if current instruction is not the insert position.
2138 if (Instr != Group->getInsertPos())
2139 return;
2141 const DataLayout &DL = Instr->getModule()->getDataLayout();
2142 Value *Ptr = getLoadStorePointerOperand(Instr);
2144 // Prepare for the vector type of the interleaved load/store.
2145 Type *ScalarTy = getMemInstValueType(Instr);
2146 unsigned InterleaveFactor = Group->getFactor();
2147 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2148 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2150 // Prepare for the new pointers.
2151 setDebugLocFromInst(Builder, Ptr);
2152 SmallVector<Value *, 2> NewPtrs;
2153 unsigned Index = Group->getIndex(Instr);
2155 VectorParts Mask;
2156 bool IsMaskForCondRequired = BlockInMask;
2157 if (IsMaskForCondRequired) {
2158 Mask = *BlockInMask;
2159 // TODO: extend the masked interleaved-group support to reversed access.
2160 assert(!Group->isReverse() && "Reversed masked interleave-group "
2161 "not supported.");
2164 // If the group is reverse, adjust the index to refer to the last vector lane
2165 // instead of the first. We adjust the index from the first vector lane,
2166 // rather than directly getting the pointer for lane VF - 1, because the
2167 // pointer operand of the interleaved access is supposed to be uniform. For
2168 // uniform instructions, we're only required to generate a value for the
2169 // first vector lane in each unroll iteration.
2170 if (Group->isReverse())
2171 Index += (VF - 1) * Group->getFactor();
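// For illustration, assuming VF = 4 and an interleave factor of 3: a member
// at index 1 gets Index = 1 + (4 - 1) * 3 = 10, so the GEP by -Index below
// rebases the wide access relative to the last vector lane rather than the
// first.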
2173 bool InBounds = false;
2174 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2175 InBounds = gep->isInBounds();
2177 for (unsigned Part = 0; Part < UF; Part++) {
2178 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2180 // Note that the current instruction could be at any member index. We need
2181 // to adjust the address to that of the member at index 0.
2183 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2184 // b = A[i]; // Member of index 0
2185 // The current pointer points to A[i+1]; adjust it to A[i].
2187 // E.g. A[i+1] = a; // Member of index 1
2188 // A[i] = b; // Member of index 0
2189 // A[i+2] = c; // Member of index 2 (Current instruction)
2190 // The current pointer points to A[i+2]; adjust it to A[i].
2191 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2192 if (InBounds)
2193 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2195 // Cast to the vector pointer type.
2196 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2199 setDebugLocFromInst(Builder, Instr);
2200 Value *UndefVec = UndefValue::get(VecTy);
2202 Value *MaskForGaps = nullptr;
2203 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2204 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2205 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2208 // Vectorize the interleaved load group.
2209 if (isa<LoadInst>(Instr)) {
2210 // For each unroll part, create a wide load for the group.
2211 SmallVector<Value *, 2> NewLoads;
2212 for (unsigned Part = 0; Part < UF; Part++) {
2213 Instruction *NewLoad;
2214 if (IsMaskForCondRequired || MaskForGaps) {
2215 assert(useMaskedInterleavedAccesses(*TTI) &&
2216 "masked interleaved groups are not allowed.");
2217 Value *GroupMask = MaskForGaps;
2218 if (IsMaskForCondRequired) {
2219 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2220 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2221 Value *ShuffledMask = Builder.CreateShuffleVector(
2222 Mask[Part], Undefs, RepMask, "interleaved.mask");
2223 GroupMask = MaskForGaps
2224 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2225 MaskForGaps)
2226 : ShuffledMask;
2228 NewLoad =
2229 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2230 GroupMask, UndefVec, "wide.masked.vec");
2232 else
2233 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2234 Group->getAlignment(), "wide.vec");
2235 Group->addMetadata(NewLoad);
2236 NewLoads.push_back(NewLoad);
2239 // For each member in the group, shuffle out the appropriate data from the
2240 // wide loads.
2241 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2242 Instruction *Member = Group->getMember(I);
2244 // Skip the gaps in the group.
2245 if (!Member)
2246 continue;
2248 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2249 for (unsigned Part = 0; Part < UF; Part++) {
2250 Value *StridedVec = Builder.CreateShuffleVector(
2251 NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2253 // If this member has a different type, cast the result to its type.
2254 if (Member->getType() != ScalarTy) {
2255 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2256 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2259 if (Group->isReverse())
2260 StridedVec = reverseVector(StridedVec);
2262 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2265 return;
2268 // The sub vector type for current instruction.
2269 VectorType *SubVT = VectorType::get(ScalarTy, VF);
2271 // Vectorize the interleaved store group.
2272 for (unsigned Part = 0; Part < UF; Part++) {
2273 // Collect the stored vector from each member.
2274 SmallVector<Value *, 4> StoredVecs;
2275 for (unsigned i = 0; i < InterleaveFactor; i++) {
2276 // An interleaved store group doesn't allow a gap, so each index has a member
2277 Instruction *Member = Group->getMember(i);
2278 assert(Member && "Fail to get a member from an interleaved store group");
2280 Value *StoredVec = getOrCreateVectorValue(
2281 cast<StoreInst>(Member)->getValueOperand(), Part);
2282 if (Group->isReverse())
2283 StoredVec = reverseVector(StoredVec);
2285 // If this member has a different type, cast it to a unified type.
2287 if (StoredVec->getType() != SubVT)
2288 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2290 StoredVecs.push_back(StoredVec);
2293 // Concatenate all vectors into a wide vector.
2294 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2296 // Interleave the elements in the wide vector.
2297 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2298 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2299 "interleaved.vec");
2301 Instruction *NewStoreInstr;
2302 if (IsMaskForCondRequired) {
2303 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2304 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2305 Value *ShuffledMask = Builder.CreateShuffleVector(
2306 Mask[Part], Undefs, RepMask, "interleaved.mask");
2307 NewStoreInstr = Builder.CreateMaskedStore(
2308 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2310 else
2311 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2312 Group->getAlignment());
2314 Group->addMetadata(NewStoreInstr);
2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2319 VectorParts *BlockInMask) {
2320 // Attempt to issue a wide load.
2321 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2322 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2324 assert((LI || SI) && "Invalid Load/Store instruction");
2326 LoopVectorizationCostModel::InstWidening Decision =
2327 Cost->getWideningDecision(Instr, VF);
2328 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2329 "CM decision should be taken at this point");
2330 if (Decision == LoopVectorizationCostModel::CM_Interleave)
2331 return vectorizeInterleaveGroup(Instr);
2333 Type *ScalarDataTy = getMemInstValueType(Instr);
2334 Type *DataTy = VectorType::get(ScalarDataTy, VF);
2335 Value *Ptr = getLoadStorePointerOperand(Instr);
2336 unsigned Alignment = getLoadStoreAlignment(Instr);
2337 // An alignment of 0 means target ABI alignment. We need to use the scalar's
2338 // target ABI alignment in such a case.
2339 const DataLayout &DL = Instr->getModule()->getDataLayout();
2340 if (!Alignment)
2341 Alignment = DL.getABITypeAlignment(ScalarDataTy);
2342 unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2344 // Determine if the pointer operand of the access is either consecutive or
2345 // reverse consecutive.
2346 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2347 bool ConsecutiveStride =
2348 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2349 bool CreateGatherScatter =
2350 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2352 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2353 // gather/scatter. Otherwise Decision should have been to Scalarize.
2354 assert((ConsecutiveStride || CreateGatherScatter) &&
2355 "The instruction should be scalarized");
2357 // Handle consecutive loads/stores.
2358 if (ConsecutiveStride)
2359 Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2361 VectorParts Mask;
2362 bool isMaskRequired = BlockInMask;
2363 if (isMaskRequired)
2364 Mask = *BlockInMask;
2366 bool InBounds = false;
2367 if (auto *gep = dyn_cast<GetElementPtrInst>(
2368 getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2369 InBounds = gep->isInBounds();
2371 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2372 // Calculate the pointer for the specific unroll-part.
2373 GetElementPtrInst *PartPtr = nullptr;
2375 if (Reverse) {
2376 // If the address is consecutive but reversed, then the
2377 // wide store needs to start at the last vector element.
2378 PartPtr = cast<GetElementPtrInst>(
2379 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2380 PartPtr->setIsInBounds(InBounds);
2381 PartPtr = cast<GetElementPtrInst>(
2382 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2383 PartPtr->setIsInBounds(InBounds);
2384 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2385 Mask[Part] = reverseVector(Mask[Part]);
2386 } else {
2387 PartPtr = cast<GetElementPtrInst>(
2388 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2389 PartPtr->setIsInBounds(InBounds);
2392 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
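// For illustration, assuming VF = 4 and a reverse access based at %p: part 0
// bitcasts a pointer to %p - 3 and part 1 a pointer to %p - 7 (GEP by
// -Part * VF, then by 1 - VF), so each wide load/store still covers its 4
// elements in ascending address order; the loaded or stored value itself is
// reversed separately. For a forward access, part 0 uses %p + 0 and part 1
// uses %p + 4.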
2395 // Handle Stores:
2396 if (SI) {
2397 setDebugLocFromInst(Builder, SI);
2399 for (unsigned Part = 0; Part < UF; ++Part) {
2400 Instruction *NewSI = nullptr;
2401 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2402 if (CreateGatherScatter) {
2403 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2404 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2405 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2406 MaskPart);
2407 } else {
2408 if (Reverse) {
2409 // If we store to reverse consecutive memory locations, then we need
2410 // to reverse the order of elements in the stored value.
2411 StoredVal = reverseVector(StoredVal);
2412 // We don't want to update the value in the map as it might be used in
2413 // another expression. So don't call resetVectorValue(StoredVal).
2415 auto *VecPtr = CreateVecPtr(Part, Ptr);
2416 if (isMaskRequired)
2417 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2418 Mask[Part]);
2419 else
2420 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2422 addMetadata(NewSI, SI);
2424 return;
2427 // Handle loads.
2428 assert(LI && "Must have a load instruction");
2429 setDebugLocFromInst(Builder, LI);
2430 for (unsigned Part = 0; Part < UF; ++Part) {
2431 Value *NewLI;
2432 if (CreateGatherScatter) {
2433 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2434 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2435 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2436 nullptr, "wide.masked.gather");
2437 addMetadata(NewLI, LI);
2438 } else {
2439 auto *VecPtr = CreateVecPtr(Part, Ptr);
2440 if (isMaskRequired)
2441 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2442 UndefValue::get(DataTy),
2443 "wide.masked.load");
2444 else
2445 NewLI =
2446 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2448 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2449 addMetadata(NewLI, LI);
2450 if (Reverse)
2451 NewLI = reverseVector(NewLI);
2453 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2458 const VPIteration &Instance,
2459 bool IfPredicateInstr) {
2460 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2462 setDebugLocFromInst(Builder, Instr);
2464 // Does this instruction return a value?
2465 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2467 Instruction *Cloned = Instr->clone();
2468 if (!IsVoidRetTy)
2469 Cloned->setName(Instr->getName() + ".cloned");
2471 // Replace the operands of the cloned instructions with their scalar
2472 // equivalents in the new loop.
2473 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2474 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2475 Cloned->setOperand(op, NewOp);
2477 addNewMetadata(Cloned, Instr);
2479 // Place the cloned scalar in the new loop.
2480 Builder.Insert(Cloned);
2482 // Add the cloned scalar to the scalar map entry.
2483 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2485 // If we just cloned a new assumption, add it to the assumption cache.
2486 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2487 if (II->getIntrinsicID() == Intrinsic::assume)
2488 AC->registerAssumption(II);
2490 // End if-block.
2491 if (IfPredicateInstr)
2492 PredicatedInstructions.push_back(Cloned);
2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2496 Value *End, Value *Step,
2497 Instruction *DL) {
2498 BasicBlock *Header = L->getHeader();
2499 BasicBlock *Latch = L->getLoopLatch();
2500 // As we're just creating this loop, it's possible no latch exists
2501 // yet. If so, use the header as this will be a single block loop.
2502 if (!Latch)
2503 Latch = Header;
2505 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2506 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2507 setDebugLocFromInst(Builder, OldInst);
2508 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2510 Builder.SetInsertPoint(Latch->getTerminator());
2511 setDebugLocFromInst(Builder, OldInst);
2513 // Create i+1 and fill the PHINode.
2514 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2515 Induction->addIncoming(Start, L->getLoopPreheader());
2516 Induction->addIncoming(Next, Latch);
2517 // Create the compare.
2518 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2519 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2521 // Now we have two terminators. Remove the old one from the block.
2522 Latch->getTerminator()->eraseFromParent();
2524 return Induction;
2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2528 if (TripCount)
2529 return TripCount;
2531 assert(L && "Create Trip Count for null loop.");
2532 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2533 // Find the loop boundaries.
2534 ScalarEvolution *SE = PSE.getSE();
2535 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2536 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2537 "Invalid loop count");
2539 Type *IdxTy = Legal->getWidestInductionType();
2540 assert(IdxTy && "No type for induction");
2542 // The exit count might have the type of i64 while the phi is i32. This can
2543 // happen if we have an induction variable that is sign extended before the
2544 // compare. The only way that we get a backedge taken count is that the
2545 // induction variable was signed and as such will not overflow. In such a case
2546 // truncation is legal.
2547 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2548 IdxTy->getPrimitiveSizeInBits())
2549 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2550 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2552 // Get the total trip count from the count by adding 1.
2553 const SCEV *ExitCount = SE->getAddExpr(
2554 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2556 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2558 // Expand the trip count and place the new instructions in the preheader.
2559 // Notice that the pre-header does not change, only the loop body.
2560 SCEVExpander Exp(*SE, DL, "induction");
2562 // Count holds the overall loop count (N).
2563 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2564 L->getLoopPreheader()->getTerminator());
2566 if (TripCount->getType()->isPointerTy())
2567 TripCount =
2568 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2569 L->getLoopPreheader()->getTerminator());
2571 return TripCount;
2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2575 if (VectorTripCount)
2576 return VectorTripCount;
2578 Value *TC = getOrCreateTripCount(L);
2579 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2581 Type *Ty = TC->getType();
2582 Constant *Step = ConstantInt::get(Ty, VF * UF);
2584 // If the tail is to be folded by masking, round the number of iterations N
2585 // up to a multiple of Step instead of rounding down. This is done by first
2586 // adding Step-1 and then rounding down. Note that it's ok if this addition
2587 // overflows: the vector induction variable will eventually wrap to zero given
2588 // that it starts at zero and its Step is a power of two; the loop will then
2589 // exit, with the last early-exit vector comparison also producing all-true.
2590 if (Cost->foldTailByMasking()) {
2591 assert(isPowerOf2_32(VF * UF) &&
2592 "VF*UF must be a power of 2 when folding tail by masking");
2593 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2596 // Now we need to generate the expression for the part of the loop that the
2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598 // iterations are not required for correctness, or N - Step, otherwise. Step
2599 // is equal to the vectorization factor (number of SIMD elements) times the
2600 // unroll factor (number of SIMD instructions).
2601 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2603 // If there is a non-reversed interleaved group that may speculatively access
2604 // memory out-of-bounds, we need to ensure that there will be at least one
2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606 // the trip count, we set the remainder to be equal to the step. If the step
2607 // does not evenly divide the trip count, no adjustment is necessary since
2608 // there will already be scalar iterations. Note that the minimum iterations
2609 // check ensures that N >= Step.
2610 if (VF > 1 && Cost->requiresScalarEpilogue()) {
2611 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2612 R = Builder.CreateSelect(IsZero, Step, R);
2615 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2617 return VectorTripCount;
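// For illustration, assuming VF = 4, UF = 2 (Step = 8) and a trip count of
// 17: without tail folding, n.mod.vf = 17 % 8 = 1 and n.vec = 16. If a scalar
// epilogue is required and the trip count were 16, the zero remainder is
// bumped to 8 so that n.vec = 8 and the epilogue still executes at least one
// iteration. With tail folding, the count is first rounded up:
// n.rnd.up = 17 + 7 = 24 and n.vec = 24.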
2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2621 const DataLayout &DL) {
2622 // Verify that V is a vector type with same number of elements as DstVTy.
2623 unsigned VF = DstVTy->getNumElements();
2624 VectorType *SrcVecTy = cast<VectorType>(V->getType());
2625 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2626 Type *SrcElemTy = SrcVecTy->getElementType();
2627 Type *DstElemTy = DstVTy->getElementType();
2628 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2629 "Vector elements must have same size");
2631 // Do a direct cast if element types are castable.
2632 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2633 return Builder.CreateBitOrPointerCast(V, DstVTy);
2635 // V cannot be directly cast to the desired vector type. This may happen
2636 // when V is a floating point vector but DstVTy is a vector of pointers, or
2637 // vice-versa. Handle this using a two-step bitcast through an intermediate
2638 // integer type, i.e. Ptr <-> Int <-> Float.
2639 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2640 "Only one type should be a pointer type");
2641 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2642 "Only one type should be a floating point type");
2643 Type *IntTy =
2644 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2645 VectorType *VecIntTy = VectorType::get(IntTy, VF);
2646 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2647 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2651 BasicBlock *Bypass) {
2652 Value *Count = getOrCreateTripCount(L);
2653 BasicBlock *BB = L->getLoopPreheader();
2654 IRBuilder<> Builder(BB->getTerminator());
2656 // Generate code to check if the loop's trip count is less than VF * UF, or
2657 // equal to it in case a scalar epilogue is required; this implies that the
2658 // vector trip count is zero. This check also covers the case where adding one
2659 // to the backedge-taken count overflowed leading to an incorrect trip count
2660 // of zero. In this case we will also jump to the scalar loop.
2661 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662 : ICmpInst::ICMP_ULT;
2664 // If tail is to be folded, vector loop takes care of all iterations.
2665 Value *CheckMinIters = Builder.getFalse();
2666 if (!Cost->foldTailByMasking())
2667 CheckMinIters = Builder.CreateICmp(
2668 P, Count, ConstantInt::get(Count->getType(), VF * UF),
2669 "min.iters.check");
2671 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2672 // Update dominator tree immediately if the generated block is a
2673 // LoopBypassBlock because SCEV expansions to generate loop bypass
2674 // checks may query it before the current function is finished.
2675 DT->addNewBlock(NewBB, BB);
2676 if (L->getParentLoop())
2677 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2678 ReplaceInstWithInst(BB->getTerminator(),
2679 BranchInst::Create(Bypass, NewBB, CheckMinIters));
2680 LoopBypassBlocks.push_back(BB);
2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2684 BasicBlock *BB = L->getLoopPreheader();
2686 // Generate the code to check the SCEV assumptions that we made.
2687 // We want the new basic block to start at the first instruction in a
2688 // sequence of instructions that form a check.
2689 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2690 "scev.check");
2691 Value *SCEVCheck =
2692 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2694 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2695 if (C->isZero())
2696 return;
2698 assert(!BB->getParent()->hasOptSize() &&
2699 "Cannot SCEV check stride or overflow when optimizing for size");
2701 // Create a new block containing the stride check.
2702 BB->setName("vector.scevcheck");
2703 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2704 // Update dominator tree immediately if the generated block is a
2705 // LoopBypassBlock because SCEV expansions to generate loop bypass
2706 // checks may query it before the current function is finished.
2707 DT->addNewBlock(NewBB, BB);
2708 if (L->getParentLoop())
2709 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2710 ReplaceInstWithInst(BB->getTerminator(),
2711 BranchInst::Create(Bypass, NewBB, SCEVCheck));
2712 LoopBypassBlocks.push_back(BB);
2713 AddedSafetyChecks = true;
2716 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2717 // VPlan-native path does not do any analysis for runtime checks currently.
2718 if (EnableVPlanNativePath)
2719 return;
2721 BasicBlock *BB = L->getLoopPreheader();
2723 // Generate the code that checks at runtime whether arrays overlap. We put
2724 // the checks into a separate block to make the more common case of few
2725 // elements faster.
2726 Instruction *FirstCheckInst;
2727 Instruction *MemRuntimeCheck;
2728 std::tie(FirstCheckInst, MemRuntimeCheck) =
2729 Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2730 if (!MemRuntimeCheck)
2731 return;
2733 assert(!BB->getParent()->hasOptSize() &&
2734 "Cannot emit memory checks when optimizing for size");
2736 // Create a new block containing the memory check.
2737 BB->setName("vector.memcheck");
2738 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2739 // Update dominator tree immediately if the generated block is a
2740 // LoopBypassBlock because SCEV expansions to generate loop bypass
2741 // checks may query it before the current function is finished.
2742 DT->addNewBlock(NewBB, BB);
2743 if (L->getParentLoop())
2744 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2745 ReplaceInstWithInst(BB->getTerminator(),
2746 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2747 LoopBypassBlocks.push_back(BB);
2748 AddedSafetyChecks = true;
2750 // We currently don't use LoopVersioning for the actual loop cloning but we
2751 // still use it to add the noalias metadata.
2752 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2753 PSE.getSE());
2754 LVer->prepareNoAliasMetadata();
2757 Value *InnerLoopVectorizer::emitTransformedIndex(
2758 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2759 const InductionDescriptor &ID) const {
2761 SCEVExpander Exp(*SE, DL, "induction");
2762 auto Step = ID.getStep();
2763 auto StartValue = ID.getStartValue();
2764 assert(Index->getType() == Step->getType() &&
2765 "Index type does not match StepValue type");
2767 // Note: the IR at this point is broken. We cannot use SE to create any new
2768 // SCEV and then expand it, hoping that SCEV's simplification will give us
2769 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2770 // lead to various SCEV crashes. So all we can do is use the builder and rely
2771 // on InstCombine for future simplifications. Here we handle only some
2772 // trivial cases.
2773 auto CreateAdd = [&B](Value *X, Value *Y) {
2774 assert(X->getType() == Y->getType() && "Types don't match!");
2775 if (auto *CX = dyn_cast<ConstantInt>(X))
2776 if (CX->isZero())
2777 return Y;
2778 if (auto *CY = dyn_cast<ConstantInt>(Y))
2779 if (CY->isZero())
2780 return X;
2781 return B.CreateAdd(X, Y);
2784 auto CreateMul = [&B](Value *X, Value *Y) {
2785 assert(X->getType() == Y->getType() && "Types don't match!");
2786 if (auto *CX = dyn_cast<ConstantInt>(X))
2787 if (CX->isOne())
2788 return Y;
2789 if (auto *CY = dyn_cast<ConstantInt>(Y))
2790 if (CY->isOne())
2791 return X;
2792 return B.CreateMul(X, Y);
2795 switch (ID.getKind()) {
2796 case InductionDescriptor::IK_IntInduction: {
2797 assert(Index->getType() == StartValue->getType() &&
2798 "Index type does not match StartValue type");
2799 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2800 return B.CreateSub(StartValue, Index);
2801 auto *Offset = CreateMul(
2802 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2803 return CreateAdd(StartValue, Offset);
2805 case InductionDescriptor::IK_PtrInduction: {
2806 assert(isa<SCEVConstant>(Step) &&
2807 "Expected constant step for pointer induction");
2808 return B.CreateGEP(
2809 StartValue->getType()->getPointerElementType(), StartValue,
2810 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2811 &*B.GetInsertPoint())));
2813 case InductionDescriptor::IK_FpInduction: {
2814 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2815 auto InductionBinOp = ID.getInductionBinOp();
2816 assert(InductionBinOp &&
2817 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2818 InductionBinOp->getOpcode() == Instruction::FSub) &&
2819 "Original bin op should be defined for FP induction");
2821 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2823 // Floating point operations had to be 'fast' to enable the induction.
2824 FastMathFlags Flags;
2825 Flags.setFast();
2827 Value *MulExp = B.CreateFMul(StepValue, Index);
2828 if (isa<Instruction>(MulExp))
2829 // We have to check because MulExp may be a constant.
2830 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2832 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2833 "induction");
2834 if (isa<Instruction>(BOp))
2835 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2837 return BOp;
2839 case InductionDescriptor::IK_NoInduction:
2840 return nullptr;
2842 llvm_unreachable("invalid enum");
2845 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2847 In this function we generate a new loop. The new loop will contain
2848 the vectorized instructions while the old loop will continue to run the
2849 scalar remainder.
2851 [ ] <-- loop iteration number check.
2854 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2855 | / |
2856 | / v
2857 || [ ] <-- vector pre header.
2858 |/ |
2860 | [ ] \
2861 | [ ]_| <-- vector loop.
2864 | -[ ] <--- middle-block.
2865 | / |
2866 | / v
2867 -|- >[ ] <--- new preheader.
2870 | [ ] \
2871 | [ ]_| <-- old scalar loop to handle remainder.
2874 >[ ] <-- exit block.
2878 BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2879 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2880 BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2881 MDNode *OrigLoopID = OrigLoop->getLoopID();
2882 assert(VectorPH && "Invalid loop structure");
2883 assert(ExitBlock && "Must have an exit block");
2885 // Some loops have a single integer induction variable, while other loops
2886 // don't. One example is C++ iterators, which often have multiple pointer
2887 // induction variables. In the code below we also support the case where we
2888 // don't have a single induction variable.
2890 // We try as hard as possible to obtain an induction variable from the
2891 // original loop. However, if we don't find one that:
2892 //   - is an integer
2893 //   - counts from zero, stepping by one
2894 //   - is the size of the widest induction variable type
2895 // then we create a new one.
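// As a rough illustration (hypothetical C++ only), a loop such as
//   for (int i = 0; i < n; ++i) { ... }
// already provides a suitable primary induction, whereas
//   for (auto it = v.begin(), e = v.end(); it != e; ++it) { ... }
// may expose only pointer inductions, in which case a fresh canonical integer
// induction is created below.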
2896 OldInduction = Legal->getPrimaryInduction();
2897 Type *IdxTy = Legal->getWidestInductionType();
2899 // Split the single block loop into the two loop structure described above.
2900 BasicBlock *VecBody =
2901 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2902 BasicBlock *MiddleBlock =
2903 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2904 BasicBlock *ScalarPH =
2905 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2907 // Create and register the new vector loop.
2908 Loop *Lp = LI->AllocateLoop();
2909 Loop *ParentLoop = OrigLoop->getParentLoop();
2911 // Insert the new loop into the loop nest and register the new basic blocks
2912 // before calling any utilities such as SCEV that require valid LoopInfo.
2913 if (ParentLoop) {
2914 ParentLoop->addChildLoop(Lp);
2915 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2916 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2917 } else {
2918 LI->addTopLevelLoop(Lp);
2920 Lp->addBasicBlockToLoop(VecBody, *LI);
2922 // Find the loop boundaries.
2923 Value *Count = getOrCreateTripCount(Lp);
2925 Value *StartIdx = ConstantInt::get(IdxTy, 0);
2927 // Now, compare the new count to zero. If it is zero skip the vector loop and
2928 // jump to the scalar loop. This check also covers the case where the
2929 // backedge-taken count is uint##_max: adding one to it will overflow leading
2930 // to an incorrect trip count of zero. In this (rare) case we will also jump
2931 // to the scalar loop.
2932 emitMinimumIterationCountCheck(Lp, ScalarPH);
2934 // Generate the code to check any assumptions that we've made for SCEV
2935 // expressions.
2936 emitSCEVChecks(Lp, ScalarPH);
2938 // Generate the code that checks at runtime whether arrays overlap. We put
2939 // the checks into a separate block to make the more common case of few
2940 // elements faster.
2941 emitMemRuntimeChecks(Lp, ScalarPH);
2943 // Generate the induction variable.
2944 // The loop step is equal to the vectorization factor (num of SIMD elements)
2945 // times the unroll factor (num of SIMD instructions).
2946 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2947 Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2948 Induction =
2949 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2950 getDebugLocFromInstOrOperands(OldInduction));
2952 // We are going to resume the execution of the scalar loop.
2953 // Go over all of the induction variables that we found and fix the
2954 // PHIs that are left in the scalar version of the loop.
2955 // The starting values of PHI nodes depend on the counter of the last
2956 // iteration in the vectorized loop.
2957 // If we come from a bypass edge then we need to start from the original
2958 // start value.
2960 // This variable saves the new starting index for the scalar loop. It is used
2961 // to test if there are any tail iterations left once the vector loop has
2962 // completed.
2963 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2964 for (auto &InductionEntry : *List) {
2965 PHINode *OrigPhi = InductionEntry.first;
2966 InductionDescriptor II = InductionEntry.second;
2968 // Create phi nodes to merge from the backedge-taken check block.
2969 PHINode *BCResumeVal = PHINode::Create(
2970 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2971 // Copy original phi DL over to the new one.
2972 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2973 Value *&EndValue = IVEndValues[OrigPhi];
2974 if (OrigPhi == OldInduction) {
2975 // We know what the end value is.
2976 EndValue = CountRoundDown;
2977 } else {
2978 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2979 Type *StepType = II.getStep()->getType();
2980 Instruction::CastOps CastOp =
2981 CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2982 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2983 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2984 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2985 EndValue->setName("ind.end");
2988 // The new PHI merges the original incoming value, in case of a bypass,
2989 // or the value at the end of the vectorized loop.
2990 BCResumeVal->addIncoming(EndValue, MiddleBlock);
2992 // Fix the scalar body counter (PHI node).
2993 // The old induction's phi node in the scalar body needs the truncated
2994 // value.
2995 for (BasicBlock *BB : LoopBypassBlocks)
2996 BCResumeVal->addIncoming(II.getStartValue(), BB);
2997 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3000 // We need the OrigLoop (scalar loop part) latch terminator to help
3001 // produce correct debug info for the middle block BB instructions.
3002 // The legality check stage guarantees that the loop will have a single
3003 // latch.
3004 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3005 "Scalar loop latch terminator isn't a branch");
3006 BranchInst *ScalarLatchBr =
3007 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3009 // Add a check in the middle block to see if we have completed
3010 // all of the iterations in the first vector loop.
3011 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3012 // If tail is to be folded, we know we don't need to run the remainder.
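// As a rough illustration (hypothetical values, assuming no scalar epilogue
// is required): with N = 10 and VF * UF = 4 the vector trip count is 8, so
// CmpN is false and the scalar loop runs the remaining 2 iterations; with
// N = 8 the counts match and we branch straight to the exit block.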
3013 Value *CmpN = Builder.getTrue();
3014 if (!Cost->foldTailByMasking()) {
3015 CmpN =
3016 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3017 CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3019 // Here we use the same DebugLoc as the scalar loop latch branch instead
3020 // of the corresponding compare because they may have ended up with
3021 // different line numbers and we want to avoid awkward line stepping while
3022 // debugging, e.g. if the compare got a line number inside the loop.
3023 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3026 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3027 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3028 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3030 // Get ready to start creating new instructions into the vectorized body.
3031 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3033 // Save the state.
3034 LoopVectorPreHeader = Lp->getLoopPreheader();
3035 LoopScalarPreHeader = ScalarPH;
3036 LoopMiddleBlock = MiddleBlock;
3037 LoopExitBlock = ExitBlock;
3038 LoopVectorBody = VecBody;
3039 LoopScalarBody = OldBasicBlock;
3041 Optional<MDNode *> VectorizedLoopID =
3042 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3043 LLVMLoopVectorizeFollowupVectorized});
3044 if (VectorizedLoopID.hasValue()) {
3045 Lp->setLoopID(VectorizedLoopID.getValue());
3047 // Do not setAlreadyVectorized if loop attributes have been defined
3048 // explicitly.
3049 return LoopVectorPreHeader;
3052 // Keep all loop hints from the original loop on the vector loop (we'll
3053 // replace the vectorizer-specific hints below).
3054 if (MDNode *LID = OrigLoop->getLoopID())
3055 Lp->setLoopID(LID);
3057 LoopVectorizeHints Hints(Lp, true, *ORE);
3058 Hints.setAlreadyVectorized();
3060 return LoopVectorPreHeader;
3063 // Fix up external users of the induction variable. At this point, we are
3064 // in LCSSA form, with all external PHIs that use the IV having one input value,
3065 // coming from the remainder loop. We need those PHIs to also have a correct
3066 // value for the IV when arriving directly from the middle block.
3067 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3068 const InductionDescriptor &II,
3069 Value *CountRoundDown, Value *EndValue,
3070 BasicBlock *MiddleBlock) {
3071 // There are two kinds of external IV usages - those that use the value
3072 // computed in the last iteration (the PHI) and those that use the penultimate
3073 // value (the value that feeds into the phi from the loop latch).
3074 // We allow both, but they obviously have different values.
3076 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3078 DenseMap<Value *, Value *> MissingVals;
3080 // An external user of the last iteration's value should see the value that
3081 // the remainder loop uses to initialize its own IV.
3082 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3083 for (User *U : PostInc->users()) {
3084 Instruction *UI = cast<Instruction>(U);
3085 if (!OrigLoop->contains(UI)) {
3086 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3087 MissingVals[UI] = EndValue;
3091 // An external user of the penultimate value needs to see EndValue - Step.
3092 // The simplest way to get this is to recompute it from the constituent SCEVs,
3093 // that is Start + (Step * (CRD - 1)).
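// As a rough illustration (hypothetical values): for an IV starting at 0 with
// step 1 and CRD = 8, an external user of the post-increment value sees 8,
// while an external user of the phi itself must see the penultimate value
// 0 + 1 * (8 - 1) = 7.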
3094 for (User *U : OrigPhi->users()) {
3095 auto *UI = cast<Instruction>(U);
3096 if (!OrigLoop->contains(UI)) {
3097 const DataLayout &DL =
3098 OrigLoop->getHeader()->getModule()->getDataLayout();
3099 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3101 IRBuilder<> B(MiddleBlock->getTerminator());
3102 Value *CountMinusOne = B.CreateSub(
3103 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3104 Value *CMO =
3105 !II.getStep()->getType()->isIntegerTy()
3106 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3107 II.getStep()->getType())
3108 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3109 CMO->setName("cast.cmo");
3110 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3111 Escape->setName("ind.escape");
3112 MissingVals[UI] = Escape;
3116 for (auto &I : MissingVals) {
3117 PHINode *PHI = cast<PHINode>(I.first);
3118 // One corner case we have to handle is two IVs "chasing" each other,
3119 // that is %IV2 = phi [...], [ %IV1, %latch ]
3120 // In this case, if IV1 has an external use, we need to avoid adding both
3121 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3122 // don't already have an incoming value for the middle block.
3123 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3124 PHI->addIncoming(I.second, MiddleBlock);
3128 namespace {
3130 struct CSEDenseMapInfo {
3131 static bool canHandle(const Instruction *I) {
3132 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3133 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3136 static inline Instruction *getEmptyKey() {
3137 return DenseMapInfo<Instruction *>::getEmptyKey();
3140 static inline Instruction *getTombstoneKey() {
3141 return DenseMapInfo<Instruction *>::getTombstoneKey();
3144 static unsigned getHashValue(const Instruction *I) {
3145 assert(canHandle(I) && "Unknown instruction!");
3146 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3147 I->value_op_end()));
3150 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3151 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3152 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3153 return LHS == RHS;
3154 return LHS->isIdenticalTo(RHS);
3158 } // end anonymous namespace
3160 /// Perform CSE of induction variable instructions.
3161 static void cse(BasicBlock *BB) {
3162 // Perform simple CSE.
3163 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3164 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3165 Instruction *In = &*I++;
3167 if (!CSEDenseMapInfo::canHandle(In))
3168 continue;
3170 // Check if we can replace this instruction with any of the
3171 // visited instructions.
3172 if (Instruction *V = CSEMap.lookup(In)) {
3173 In->replaceAllUsesWith(V);
3174 In->eraseFromParent();
3175 continue;
3178 CSEMap[In] = In;
3182 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3183 unsigned VF,
3184 bool &NeedToScalarize) {
3185 Function *F = CI->getCalledFunction();
3186 StringRef FnName = CI->getCalledFunction()->getName();
3187 Type *ScalarRetTy = CI->getType();
3188 SmallVector<Type *, 4> Tys, ScalarTys;
3189 for (auto &ArgOp : CI->arg_operands())
3190 ScalarTys.push_back(ArgOp->getType());
3192 // Estimate cost of scalarized vector call. The source operands are assumed
3193 // to be vectors, so we need to extract individual elements from them,
3194 // execute VF scalar calls, and then gather the results into the vector
3195 // return value.
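// As a rough illustration (hypothetical costs): with VF = 4, a scalar call
// cost of 10 and a scalarization overhead of 6, the scalarized cost below is
// 10 * 4 + 6 = 46; a vector library call is chosen only if it costs less.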
3196 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3197 if (VF == 1)
3198 return ScalarCallCost;
3200 // Compute corresponding vector type for return value and arguments.
3201 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3202 for (Type *ScalarTy : ScalarTys)
3203 Tys.push_back(ToVectorTy(ScalarTy, VF));
3205 // Compute costs of unpacking argument values for the scalar calls and
3206 // packing the return values to a vector.
3207 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3209 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3211 // If we can't emit a vector call for this function, then the currently found
3212 // cost is the cost we need to return.
3213 NeedToScalarize = true;
3214 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3215 return Cost;
3217 // If the corresponding vector cost is cheaper, return its cost.
3218 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3219 if (VectorCallCost < Cost) {
3220 NeedToScalarize = false;
3221 return VectorCallCost;
3223 return Cost;
3226 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3227 unsigned VF) {
3228 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3229 assert(ID && "Expected intrinsic call!");
3231 FastMathFlags FMF;
3232 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3233 FMF = FPMO->getFastMathFlags();
3235 SmallVector<Value *, 4> Operands(CI->arg_operands());
3236 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3239 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3240 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3241 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3242 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3244 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3245 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3246 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3247 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3250 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3251 // For every instruction `I` in MinBWs, truncate the operands, create a
3252 // truncated version of `I` and reextend its result. InstCombine runs
3253 // later and will remove any ext/trunc pairs.
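// As a rough illustration in shorthand IR, if an i32 add is known to need
// only 8 bits,
//   %a = add <4 x i32> %x, %y
// becomes
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>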
3254 SmallPtrSet<Value *, 4> Erased;
3255 for (const auto &KV : Cost->getMinimalBitwidths()) {
3256 // If the value wasn't vectorized, we must maintain the original scalar
3257 // type. The absence of the value from VectorLoopValueMap indicates that it
3258 // wasn't vectorized.
3259 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3260 continue;
3261 for (unsigned Part = 0; Part < UF; ++Part) {
3262 Value *I = getOrCreateVectorValue(KV.first, Part);
3263 if (Erased.find(I) != Erased.end() || I->use_empty() ||
3264 !isa<Instruction>(I))
3265 continue;
3266 Type *OriginalTy = I->getType();
3267 Type *ScalarTruncatedTy =
3268 IntegerType::get(OriginalTy->getContext(), KV.second);
3269 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3270 OriginalTy->getVectorNumElements());
3271 if (TruncatedTy == OriginalTy)
3272 continue;
3274 IRBuilder<> B(cast<Instruction>(I));
3275 auto ShrinkOperand = [&](Value *V) -> Value * {
3276 if (auto *ZI = dyn_cast<ZExtInst>(V))
3277 if (ZI->getSrcTy() == TruncatedTy)
3278 return ZI->getOperand(0);
3279 return B.CreateZExtOrTrunc(V, TruncatedTy);
3282 // The actual instruction modification depends on the instruction type,
3283 // unfortunately.
3284 Value *NewI = nullptr;
3285 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3286 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3287 ShrinkOperand(BO->getOperand(1)));
3289 // Any wrapping introduced by shrinking this operation shouldn't be
3290 // considered undefined behavior. So, we can't unconditionally copy
3291 // arithmetic wrapping flags to NewI.
3292 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3293 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3294 NewI =
3295 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3296 ShrinkOperand(CI->getOperand(1)));
3297 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3298 NewI = B.CreateSelect(SI->getCondition(),
3299 ShrinkOperand(SI->getTrueValue()),
3300 ShrinkOperand(SI->getFalseValue()));
3301 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3302 switch (CI->getOpcode()) {
3303 default:
3304 llvm_unreachable("Unhandled cast!");
3305 case Instruction::Trunc:
3306 NewI = ShrinkOperand(CI->getOperand(0));
3307 break;
3308 case Instruction::SExt:
3309 NewI = B.CreateSExtOrTrunc(
3310 CI->getOperand(0),
3311 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3312 break;
3313 case Instruction::ZExt:
3314 NewI = B.CreateZExtOrTrunc(
3315 CI->getOperand(0),
3316 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3317 break;
3319 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3320 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3321 auto *O0 = B.CreateZExtOrTrunc(
3322 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3323 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3324 auto *O1 = B.CreateZExtOrTrunc(
3325 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3327 NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3328 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3329 // Don't do anything with the operands, just extend the result.
3330 continue;
3331 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3332 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3333 auto *O0 = B.CreateZExtOrTrunc(
3334 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3335 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3336 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3337 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3338 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3339 auto *O0 = B.CreateZExtOrTrunc(
3340 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3341 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3342 } else {
3343 // If we don't know what to do, be conservative and don't do anything.
3344 continue;
3347 // Lastly, extend the result.
3348 NewI->takeName(cast<Instruction>(I));
3349 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3350 I->replaceAllUsesWith(Res);
3351 cast<Instruction>(I)->eraseFromParent();
3352 Erased.insert(I);
3353 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3357 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3358 for (const auto &KV : Cost->getMinimalBitwidths()) {
3359 // If the value wasn't vectorized, we must maintain the original scalar
3360 // type. The absence of the value from VectorLoopValueMap indicates that it
3361 // wasn't vectorized.
3362 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3363 continue;
3364 for (unsigned Part = 0; Part < UF; ++Part) {
3365 Value *I = getOrCreateVectorValue(KV.first, Part);
3366 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3367 if (Inst && Inst->use_empty()) {
3368 Value *NewI = Inst->getOperand(0);
3369 Inst->eraseFromParent();
3370 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3376 void InnerLoopVectorizer::fixVectorizedLoop() {
3377 // Insert truncates and extends for any truncated instructions as hints to
3378 // InstCombine.
3379 if (VF > 1)
3380 truncateToMinimalBitwidths();
3382 // Fix widened non-induction PHIs by setting up the PHI operands.
3383 if (OrigPHIsToFix.size()) {
3384 assert(EnableVPlanNativePath &&
3385 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3386 fixNonInductionPHIs();
3389 // At this point every instruction in the original loop is widened to a
3390 // vector form. Now we need to fix the recurrences in the loop. These PHI
3391 // nodes are currently empty because we did not want to introduce cycles.
3392 // This is the second stage of vectorizing recurrences.
3393 fixCrossIterationPHIs();
3395 // Update the dominator tree.
3397 // FIXME: After creating the structure of the new loop, the dominator tree is
3398 // no longer up-to-date, and it remains that way until we update it
3399 // here. An out-of-date dominator tree is problematic for SCEV,
3400 // because SCEVExpander uses it to guide code generation. The
3401 // vectorizer uses SCEVExpander in several places. Instead, we should
3402 // keep the dominator tree up-to-date as we go.
3403 updateAnalysis();
3405 // Fix-up external users of the induction variables.
3406 for (auto &Entry : *Legal->getInductionVars())
3407 fixupIVUsers(Entry.first, Entry.second,
3408 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3409 IVEndValues[Entry.first], LoopMiddleBlock);
3411 fixLCSSAPHIs();
3412 for (Instruction *PI : PredicatedInstructions)
3413 sinkScalarOperands(&*PI);
3415 // Remove redundant induction instructions.
3416 cse(LoopVectorBody);
3419 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3420 // In order to support recurrences we need to be able to vectorize Phi nodes.
3421 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3422 // stage #2: We now need to fix the recurrences by adding incoming edges to
3423 // the currently empty PHI nodes. At this point every instruction in the
3424 // original loop is widened to a vector form so we can use them to construct
3425 // the incoming edges.
3426 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3427 // Handle first-order recurrences and reductions that need to be fixed.
3428 if (Legal->isFirstOrderRecurrence(&Phi))
3429 fixFirstOrderRecurrence(&Phi);
3430 else if (Legal->isReductionVariable(&Phi))
3431 fixReduction(&Phi);
3435 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3436 // This is the second phase of vectorizing first-order recurrences. An
3437 // overview of the transformation is described below. Suppose we have the
3438 // following loop.
3440 // for (int i = 0; i < n; ++i)
3441 // b[i] = a[i] - a[i - 1];
3443 // There is a first-order recurrence on "a". For this loop, the shorthand
3444 // scalar IR looks like:
3446 // scalar.ph:
3447 // s_init = a[-1]
3448 // br scalar.body
3450 // scalar.body:
3451 // i = phi [0, scalar.ph], [i+1, scalar.body]
3452 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3453 // s2 = a[i]
3454 // b[i] = s2 - s1
3455 // br cond, scalar.body, ...
3457 // In this example, s1 is a recurrence because its value depends on the
3458 // previous iteration. In the first phase of vectorization, we created a
3459 // temporary value for s1. We now complete the vectorization and produce the
3460 // shorthand vector IR shown below (for VF = 4, UF = 1).
3462 // vector.ph:
3463 // v_init = vector(..., ..., ..., a[-1])
3464 // br vector.body
3466 // vector.body
3467 // i = phi [0, vector.ph], [i+4, vector.body]
3468 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3469 // v2 = a[i, i+1, i+2, i+3];
3470 // v3 = vector(v1(3), v2(0, 1, 2))
3471 // b[i, i+1, i+2, i+3] = v2 - v3
3472 // br cond, vector.body, middle.block
3474 // middle.block:
3475 // x = v2(3)
3476 // br scalar.ph
3478 // scalar.ph:
3479 // s_init = phi [x, middle.block], [a[-1], otherwise]
3480 // br scalar.body
3482 // After the vector loop finishes executing, we extract the next value of
3483 // the recurrence (x) to use as the initial value in the scalar loop.
3485 // Get the original loop preheader and single loop latch.
3486 auto *Preheader = OrigLoop->getLoopPreheader();
3487 auto *Latch = OrigLoop->getLoopLatch();
3489 // Get the initial and previous values of the scalar recurrence.
3490 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3491 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3493 // Create a vector from the initial value.
3494 auto *VectorInit = ScalarInit;
3495 if (VF > 1) {
3496 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3497 VectorInit = Builder.CreateInsertElement(
3498 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3499 Builder.getInt32(VF - 1), "vector.recur.init");
3502 // We constructed a temporary phi node in the first phase of vectorization.
3503 // This phi node will eventually be deleted.
3504 Builder.SetInsertPoint(
3505 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3507 // Create a phi node for the new recurrence. The current value will either be
3508 // the initial value inserted into a vector or loop-varying vector value.
3509 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3510 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3512 // Get the vectorized previous value of the last part UF - 1. It appears last
3513 // among all unrolled iterations, due to the order of their construction.
3514 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3516 // Set the insertion point after the previous value if it is an instruction.
3517 // Note that the previous value may have been constant-folded so it is not
3518 // guaranteed to be an instruction in the vector loop. Also, if the previous
3519 // value is a phi node, we should insert after all the phi nodes to avoid
3520 // breaking basic block verification.
3521 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3522 isa<PHINode>(PreviousLastPart))
3523 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3524 else
3525 Builder.SetInsertPoint(
3526 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3528 // We will construct a vector for the recurrence by combining the values for
3529 // the current and previous iterations. This is the required shuffle mask.
3530 SmallVector<Constant *, 8> ShuffleMask(VF);
3531 ShuffleMask[0] = Builder.getInt32(VF - 1);
3532 for (unsigned I = 1; I < VF; ++I)
3533 ShuffleMask[I] = Builder.getInt32(I + VF - 1);
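// As a rough illustration, with VF = 4 the mask is <3, 4, 5, 6>: lane 3 of
// the first operand followed by lanes 0-2 of the second, matching the
// shorthand v3 = vector(v1(3), v2(0, 1, 2)) shown above.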
3535 // The vector from which to take the initial value for the current iteration
3536 // (actual or unrolled). Initially, this is the vector phi node.
3537 Value *Incoming = VecPhi;
3539 // Shuffle the current and previous vector and update the vector parts.
3540 for (unsigned Part = 0; Part < UF; ++Part) {
3541 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3542 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3543 auto *Shuffle =
3544 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3545 ConstantVector::get(ShuffleMask))
3546 : Incoming;
3547 PhiPart->replaceAllUsesWith(Shuffle);
3548 cast<Instruction>(PhiPart)->eraseFromParent();
3549 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3550 Incoming = PreviousPart;
3553 // Fix the latch value of the new recurrence in the vector loop.
3554 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3556 // Extract the last vector element in the middle block. This will be the
3557 // initial value for the recurrence when jumping to the scalar loop.
3558 auto *ExtractForScalar = Incoming;
3559 if (VF > 1) {
3560 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3561 ExtractForScalar = Builder.CreateExtractElement(
3562 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3564 // Extract the second-to-last element in the middle block if the
3565 // Phi is used outside the loop. We need to extract the phi itself
3566 // and not the last element (the phi update in the current iteration). This
3567 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3568 // when the scalar loop is not run at all.
3569 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3570 if (VF > 1)
3571 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3572 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3573 // When the loop is unrolled without vectorizing, initialize
3574 // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled value
3575 // of `Incoming`. This is analogous to the vectorized case above: extracting
3576 // the second-to-last element when VF > 1.
3577 else if (UF > 1)
3578 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3580 // Fix the initial value of the original recurrence in the scalar loop.
3581 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3582 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3583 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3584 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3585 Start->addIncoming(Incoming, BB);
3588 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3589 Phi->setName("scalar.recur");
3591 // Finally, fix users of the recurrence outside the loop. The users will need
3592 // either the last value of the scalar recurrence or the last value of the
3593 // vector recurrence we extracted in the middle block. Since the loop is in
3594 // LCSSA form, we just need to find all the phi nodes for the original scalar
3595 // recurrence in the exit block, and then add an edge for the middle block.
3596 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3597 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3598 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3603 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3604 Constant *Zero = Builder.getInt32(0);
3606 // Get its reduction variable descriptor.
3607 assert(Legal->isReductionVariable(Phi) &&
3608 "Unable to find the reduction variable");
3609 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3611 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3612 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3613 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3614 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3615 RdxDesc.getMinMaxRecurrenceKind();
3616 setDebugLocFromInst(Builder, ReductionStartValue);
3618 // We need to generate a reduction vector from the incoming scalar.
3619 // To do so, we need to generate the 'identity' vector and override
3620 // one of the elements with the incoming scalar reduction. We need
3621 // to do it in the vector-loop preheader.
3622 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3624 // This is the vector-clone of the value that leaves the loop.
3625 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3627 // Find the reduction identity variable. Zero for addition, or, and xor;
3628 // one for multiplication; -1 for and.
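// As a rough illustration (hypothetical values): an integer add reduction
// starting at 5 with VF = 4 uses Identity = <0, 0, 0, 0> and
// VectorStart = <5, 0, 0, 0>, so the start value is only accounted for once.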
3629 Value *Identity;
3630 Value *VectorStart;
3631 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3632 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3633 // MinMax reductions have the start value as their identity.
3634 if (VF == 1) {
3635 VectorStart = Identity = ReductionStartValue;
3636 } else {
3637 VectorStart = Identity =
3638 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3640 } else {
3641 // Handle other reduction kinds:
3642 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3643 RK, VecTy->getScalarType());
3644 if (VF == 1) {
3645 Identity = Iden;
3646 // This vector is the Identity vector where the first element is the
3647 // incoming scalar reduction.
3648 VectorStart = ReductionStartValue;
3649 } else {
3650 Identity = ConstantVector::getSplat(VF, Iden);
3652 // This vector is the Identity vector where the first element is the
3653 // incoming scalar reduction.
3654 VectorStart =
3655 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3659 // Fix the vector-loop phi.
3661 // Reductions do not have to start at zero. They can start with
3662 // any loop invariant values.
3663 BasicBlock *Latch = OrigLoop->getLoopLatch();
3664 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3665 for (unsigned Part = 0; Part < UF; ++Part) {
3666 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3667 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3668 // Make sure to add the reduction start value only to the
3669 // first unroll part.
3670 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3671 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3672 cast<PHINode>(VecRdxPhi)
3673 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3676 // Before each round, move the insertion point right between
3677 // the PHIs and the values we are going to write.
3678 // This allows us to write both PHINodes and the extractelement
3679 // instructions.
3680 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3682 setDebugLocFromInst(Builder, LoopExitInst);
3684 // If tail is folded by masking, the vector value that leaves the loop should
3685 // be a Select choosing between the vectorized LoopExitInst and the vectorized
3686 // Phi, rather than the LoopExitInst alone.
3687 if (Cost->foldTailByMasking()) {
3688 for (unsigned Part = 0; Part < UF; ++Part) {
3689 Value *VecLoopExitInst =
3690 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3691 Value *Sel = nullptr;
3692 for (User *U : VecLoopExitInst->users()) {
3693 if (isa<SelectInst>(U)) {
3694 assert(!Sel && "Reduction exit feeding two selects");
3695 Sel = U;
3696 } else
3697 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3699 assert(Sel && "Reduction exit feeds no select");
3700 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3704 // If the vector reduction can be performed in a smaller type, we truncate
3705 // then extend the loop exit value to enable InstCombine to evaluate the
3706 // entire expression in the smaller type.
3707 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3708 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3709 Builder.SetInsertPoint(
3710 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3711 VectorParts RdxParts(UF);
3712 for (unsigned Part = 0; Part < UF; ++Part) {
3713 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3714 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3715 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3716 : Builder.CreateZExt(Trunc, VecTy);
3717 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3718 UI != RdxParts[Part]->user_end();)
3719 if (*UI != Trunc) {
3720 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3721 RdxParts[Part] = Extnd;
3722 } else {
3723 ++UI;
3726 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3727 for (unsigned Part = 0; Part < UF; ++Part) {
3728 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3729 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3733 // Reduce all of the unrolled parts into a single vector.
3734 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3735 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3737 // The middle block terminator has already been assigned a DebugLoc here (the
3738 // OrigLoop's single latch terminator). We want the whole middle block to
3739 // appear to execute on this line because: (a) it is all compiler generated,
3740 // (b) these instructions are always executed after evaluating the latch
3741 // conditional branch, and (c) other passes may add new predecessors which
3742 // terminate on this line. This is the easiest way to ensure we don't
3743 // accidentally cause an extra step back into the loop while debugging.
3744 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3745 for (unsigned Part = 1; Part < UF; ++Part) {
3746 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3747 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3748 // Floating point operations had to be 'fast' to enable the reduction.
3749 ReducedPartRdx = addFastMathFlag(
3750 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3751 ReducedPartRdx, "bin.rdx"),
3752 RdxDesc.getFastMathFlags());
3753 else
3754 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3755 RdxPart);
3758 if (VF > 1) {
3759 bool NoNaN = Legal->hasFunNoNaNAttr();
3760 ReducedPartRdx =
3761 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3762 // If the reduction can be performed in a smaller type, we need to extend
3763 // the reduction to the wider type before we branch to the original loop.
3764 if (Phi->getType() != RdxDesc.getRecurrenceType())
3765 ReducedPartRdx =
3766 RdxDesc.isSigned()
3767 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3768 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3771 // Create a phi node that merges control-flow from the backedge-taken check
3772 // block and the middle block.
3773 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3774 LoopScalarPreHeader->getTerminator());
3775 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3776 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3777 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3779 // Now, we need to fix the users of the reduction variable
3780 // inside and outside of the scalar remainder loop.
3781 // We know that the loop is in LCSSA form. We need to update the
3782 // PHI nodes in the exit blocks.
3783 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3784 // All PHINodes need to have a single entry edge, or two if
3785 // we already fixed them.
3786 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3788 // We found a reduction value exit-PHI. Update it with the
3789 // incoming bypass edge.
3790 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3791 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3792 } // end of the LCSSA phi scan.
3794 // Fix the scalar loop reduction variable with the incoming reduction sum
3795 // from the vector body and from the backedge value.
3796 int IncomingEdgeBlockIdx =
3797 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3798 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3799 // Pick the other block.
3800 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3801 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3802 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3805 void InnerLoopVectorizer::fixLCSSAPHIs() {
3806 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3807 if (LCSSAPhi.getNumIncomingValues() == 1) {
3808 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3809 // Non-instruction incoming values will have only one value.
3810 unsigned LastLane = 0;
3811 if (isa<Instruction>(IncomingValue))
3812 LastLane = Cost->isUniformAfterVectorization(
3813 cast<Instruction>(IncomingValue), VF)
3815 ? 0 : VF - 1;
3816 // Can be a loop invariant incoming value or the last scalar value to be
3817 // extracted from the vectorized loop.
3818 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3819 Value *lastIncomingValue =
3820 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3821 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3826 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3827 // The basic block and loop containing the predicated instruction.
3828 auto *PredBB = PredInst->getParent();
3829 auto *VectorLoop = LI->getLoopFor(PredBB);
3831 // Initialize a worklist with the operands of the predicated instruction.
3832 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3834 // Holds instructions that we need to analyze again. An instruction may be
3835 // reanalyzed if we don't yet know if we can sink it or not.
3836 SmallVector<Instruction *, 8> InstsToReanalyze;
3838 // Returns true if a given use occurs in the predicated block. Phi nodes use
3839 // their operands in their corresponding predecessor blocks.
3840 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3841 auto *I = cast<Instruction>(U.getUser());
3842 BasicBlock *BB = I->getParent();
3843 if (auto *Phi = dyn_cast<PHINode>(I))
3844 BB = Phi->getIncomingBlock(
3845 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3846 return BB == PredBB;
3849 // Iteratively sink the scalarized operands of the predicated instruction
3850 // into the block we created for it. When an instruction is sunk, its
3851 // operands are then added to the worklist. The algorithm ends once a full
3852 // pass through the worklist fails to sink a single instruction.
3853 bool Changed;
3854 do {
3855 // Add the instructions that need to be reanalyzed to the worklist, and
3856 // reset the changed indicator.
3857 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3858 InstsToReanalyze.clear();
3859 Changed = false;
3861 while (!Worklist.empty()) {
3862 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3864 // We can't sink an instruction if it is a phi node, is already in the
3865 // predicated block, is not in the loop, or may have side effects.
3866 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3867 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3868 continue;
3870 // It's legal to sink the instruction if all its uses occur in the
3871 // predicated block. Otherwise, there's nothing to do yet, and we may
3872 // need to reanalyze the instruction.
3873 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3874 InstsToReanalyze.push_back(I);
3875 continue;
3878 // Move the instruction to the beginning of the predicated block, and add
3879 // its operands to the worklist.
3880 I->moveBefore(&*PredBB->getFirstInsertionPt());
3881 Worklist.insert(I->op_begin(), I->op_end());
3883 // The sinking may have enabled other instructions to be sunk, so we will
3884 // need to iterate.
3885 Changed = true;
3887 } while (Changed);
3890 void InnerLoopVectorizer::fixNonInductionPHIs() {
3891 for (PHINode *OrigPhi : OrigPHIsToFix) {
3892 PHINode *NewPhi =
3893 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3894 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3896 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3897 predecessors(OrigPhi->getParent()));
3898 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3899 predecessors(NewPhi->getParent()));
3900 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3901 "Scalar and Vector BB should have the same number of predecessors");
3903 // The insertion point in Builder may be invalidated by the time we get
3904 // here. Force the Builder insertion point to something valid so that we do
3905 // not run into issues during insertion point restore in
3906 // getOrCreateVectorValue calls below.
3907 Builder.SetInsertPoint(NewPhi);
3909 // The predecessor order is preserved and we can rely on mapping between
3910 // scalar and vector block predecessors.
3911 for (unsigned i = 0; i < NumIncomingValues; ++i) {
3912 BasicBlock *NewPredBB = VectorBBPredecessors[i];
3914 // When looking up the new scalar/vector values to fix up, use incoming
3915 // values from original phi.
3916 Value *ScIncV =
3917 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3919 // Scalar incoming value may need a broadcast
3920 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3921 NewPhi->addIncoming(NewIncV, NewPredBB);
3926 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3927 unsigned VF) {
3928 PHINode *P = cast<PHINode>(PN);
3929 if (EnableVPlanNativePath) {
3930 // Currently we enter here in the VPlan-native path for non-induction
3931 // PHIs where all control flow is uniform. We simply widen these PHIs.
3932 // Create a vector phi with no operands - the vector phi operands will be
3933 // set at the end of vector code generation.
3934 Type *VecTy =
3935 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3936 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3937 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3938 OrigPHIsToFix.push_back(P);
3940 return;
3943 assert(PN->getParent() == OrigLoop->getHeader() &&
3944 "Non-header phis should have been handled elsewhere");
3946 // In order to support recurrences we need to be able to vectorize Phi nodes.
3947 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3948 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3949 // this value when we vectorize all of the instructions that use the PHI.
3950 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3951 for (unsigned Part = 0; Part < UF; ++Part) {
3952 // This is phase one of vectorizing PHIs.
3953 Type *VecTy =
3954 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3955 Value *EntryPart = PHINode::Create(
3956 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3957 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3959 return;
3962 setDebugLocFromInst(Builder, P);
3964 // This PHINode must be an induction variable.
3965 // Make sure that we know about it.
3966 assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3968 InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3969 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3971 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3972 // which can be found from the original scalar operations.
3973 switch (II.getKind()) {
3974 case InductionDescriptor::IK_NoInduction:
3975 llvm_unreachable("Unknown induction");
3976 case InductionDescriptor::IK_IntInduction:
3977 case InductionDescriptor::IK_FpInduction:
3978 llvm_unreachable("Integer/fp induction is handled elsewhere.");
3979 case InductionDescriptor::IK_PtrInduction: {
3980 // Handle the pointer induction variable case.
3981 assert(P->getType()->isPointerTy() && "Unexpected type.");
3982 // This is the normalized GEP that starts counting at zero.
3983 Value *PtrInd = Induction;
3984 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3985 // Determine the number of scalars we need to generate for each unroll
3986 // iteration. If the instruction is uniform, we only need to generate the
3987 // first lane. Otherwise, we generate all VF values.
3988 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3989 // These are the scalar results. Notice that we don't generate vector GEPs
3990 // because scalar GEPs result in better code.
3991 for (unsigned Part = 0; Part < UF; ++Part) {
3992 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3993 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3994 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3995 Value *SclrGep =
3996 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3997 SclrGep->setName("next.gep");
3998 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4001 return;
4006 /// A helper function for checking whether an integer division-related
4007 /// instruction may divide by zero (in which case it must be predicated if
4008 /// executed conditionally in the scalar code).
4009 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4010 /// Non-zero divisors that are not compile-time constants will not be
4011 /// converted into multiplication, so we will still end up scalarizing
4012 /// the division, but can do so without predication.
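/// For example (illustrative operands only), a conditional 'udiv %x, 7' can
/// never divide by zero and need not be predicated, while 'udiv %x, %y' must
/// be, since %y may be zero at run time.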
4013 static bool mayDivideByZero(Instruction &I) {
4014 assert((I.getOpcode() == Instruction::UDiv ||
4015 I.getOpcode() == Instruction::SDiv ||
4016 I.getOpcode() == Instruction::URem ||
4017 I.getOpcode() == Instruction::SRem) &&
4018 "Unexpected instruction");
4019 Value *Divisor = I.getOperand(1);
4020 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4021 return !CInt || CInt->isZero();
4024 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4025 switch (I.getOpcode()) {
4026 case Instruction::Br:
4027 case Instruction::PHI:
4028 llvm_unreachable("This instruction is handled by a different recipe.");
4029 case Instruction::GetElementPtr: {
4030 // Construct a vector GEP by widening the operands of the scalar GEP as
4031 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4032 // results in a vector of pointers when at least one operand of the GEP
4033 // is vector-typed. Thus, to keep the representation compact, we only use
4034 // vector-typed operands for loop-varying values.
4035 auto *GEP = cast<GetElementPtrInst>(&I);
4037 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4038 // If we are vectorizing, but the GEP has only loop-invariant operands,
4039 // the GEP we build (by only using vector-typed operands for
4040 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4041 // produce a vector of pointers, we need to either arbitrarily pick an
4042 // operand to broadcast, or broadcast a clone of the original GEP.
4043 // Here, we broadcast a clone of the original.
4045 // TODO: If at some point we decide to scalarize instructions having
4046 // loop-invariant operands, this special case will no longer be
4047 // required. We would add the scalarization decision to
4048 // collectLoopScalars() and teach getVectorValue() to broadcast
4049 // the lane-zero scalar value.
4050 auto *Clone = Builder.Insert(GEP->clone());
4051 for (unsigned Part = 0; Part < UF; ++Part) {
4052 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4053 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4054 addMetadata(EntryPart, GEP);
4056 } else {
4057 // If the GEP has at least one loop-varying operand, we are sure to
4058 // produce a vector of pointers. But if we are only unrolling, we want
4059 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4060 // produce with the code below will be scalar (if VF == 1) or vector
4061 // (otherwise). Note that for the unroll-only case, we still maintain
4062 // values in the vector mapping with initVector, as we do for other
4063 // instructions.
4064 for (unsigned Part = 0; Part < UF; ++Part) {
4065 // The pointer operand of the new GEP. If it's loop-invariant, we
4066 // won't broadcast it.
4067 auto *Ptr =
4068 OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4069 ? GEP->getPointerOperand()
4070 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4072 // Collect all the indices for the new GEP. If any index is
4073 // loop-invariant, we won't broadcast it.
4074 SmallVector<Value *, 4> Indices;
4075 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4076 if (OrigLoop->isLoopInvariant(U.get()))
4077 Indices.push_back(U.get());
4078 else
4079 Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4082 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4083 // but it should be a vector, otherwise.
4084 auto *NewGEP =
4085 GEP->isInBounds()
4086 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4087 Indices)
4088 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4089 assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4090 "NewGEP is not a pointer vector");
4091 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4092 addMetadata(NewGEP, GEP);
4096 break;
4098 case Instruction::UDiv:
4099 case Instruction::SDiv:
4100 case Instruction::SRem:
4101 case Instruction::URem:
4102 case Instruction::Add:
4103 case Instruction::FAdd:
4104 case Instruction::Sub:
4105 case Instruction::FSub:
4106 case Instruction::FNeg:
4107 case Instruction::Mul:
4108 case Instruction::FMul:
4109 case Instruction::FDiv:
4110 case Instruction::FRem:
4111 case Instruction::Shl:
4112 case Instruction::LShr:
4113 case Instruction::AShr:
4114 case Instruction::And:
4115 case Instruction::Or:
4116 case Instruction::Xor: {
4117 // Just widen unops and binops.
4118 setDebugLocFromInst(Builder, &I);
4120 for (unsigned Part = 0; Part < UF; ++Part) {
4121 SmallVector<Value *, 2> Ops;
4122 for (Value *Op : I.operands())
4123 Ops.push_back(getOrCreateVectorValue(Op, Part));
4125 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4127 if (auto *VecOp = dyn_cast<Instruction>(V))
4128 VecOp->copyIRFlags(&I);
4130 // Use this vector value for all users of the original instruction.
4131 VectorLoopValueMap.setVectorValue(&I, Part, V);
4132 addMetadata(V, &I);
4135 break;
4137 case Instruction::Select: {
4138 // Widen selects.
4139 // If the selector is loop invariant we can create a select
4140 // instruction with a scalar condition. Otherwise, use vector-select.
4141 auto *SE = PSE.getSE();
4142 bool InvariantCond =
4143 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4144 setDebugLocFromInst(Builder, &I);
4146 // The condition can be loop invariant but still defined inside the
4147 // loop. This means that we can't just use the original 'cond' value.
4148 // We have to take the 'vectorized' value and pick the first lane.
4149 // Instcombine will make this a no-op.
4151 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4153 for (unsigned Part = 0; Part < UF; ++Part) {
4154 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4155 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4156 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4157 Value *Sel =
4158 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4159 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4160 addMetadata(Sel, &I);
4163 break;
4166 case Instruction::ICmp:
4167 case Instruction::FCmp: {
4168 // Widen compares. Generate vector compares.
4169 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4170 auto *Cmp = dyn_cast<CmpInst>(&I);
4171 setDebugLocFromInst(Builder, Cmp);
4172 for (unsigned Part = 0; Part < UF; ++Part) {
4173 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4174 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4175 Value *C = nullptr;
4176 if (FCmp) {
4177 // Propagate fast math flags.
4178 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4179 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4180 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4181 } else {
4182 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4184 VectorLoopValueMap.setVectorValue(&I, Part, C);
4185 addMetadata(C, &I);
4188 break;
4191 case Instruction::ZExt:
4192 case Instruction::SExt:
4193 case Instruction::FPToUI:
4194 case Instruction::FPToSI:
4195 case Instruction::FPExt:
4196 case Instruction::PtrToInt:
4197 case Instruction::IntToPtr:
4198 case Instruction::SIToFP:
4199 case Instruction::UIToFP:
4200 case Instruction::Trunc:
4201 case Instruction::FPTrunc:
4202 case Instruction::BitCast: {
4203 auto *CI = dyn_cast<CastInst>(&I);
4204 setDebugLocFromInst(Builder, CI);
4206 /// Vectorize casts.
4207 Type *DestTy =
4208 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4210 for (unsigned Part = 0; Part < UF; ++Part) {
4211 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4212 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4213 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4214 addMetadata(Cast, &I);
4216 break;
4219 case Instruction::Call: {
4220 // Ignore dbg intrinsics.
4221 if (isa<DbgInfoIntrinsic>(I))
4222 break;
4223 setDebugLocFromInst(Builder, &I);
4225 Module *M = I.getParent()->getParent()->getParent();
4226 auto *CI = cast<CallInst>(&I);
4228 StringRef FnName = CI->getCalledFunction()->getName();
4229 Function *F = CI->getCalledFunction();
4230 Type *RetTy = ToVectorTy(CI->getType(), VF);
4231 SmallVector<Type *, 4> Tys;
4232 for (Value *ArgOperand : CI->arg_operands())
4233 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4235 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4237 // The flag shows whether we use an intrinsic or a regular call for the
4238 // vectorized version of the instruction, i.e., whether it is beneficial to
4239 // perform the intrinsic call compared to the library call.
4240 bool NeedToScalarize;
4241 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4242 bool UseVectorIntrinsic =
4243 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4244 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4245 "Instruction should be scalarized elsewhere.");
4247 for (unsigned Part = 0; Part < UF; ++Part) {
4248 SmallVector<Value *, 4> Args;
4249 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4250 Value *Arg = CI->getArgOperand(i);
4251 // Some intrinsics have a scalar argument - don't replace it with a
4252 // vector.
4253 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4254 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4255 Args.push_back(Arg);
4258 Function *VectorF;
4259 if (UseVectorIntrinsic) {
4260 // Use vector version of the intrinsic.
4261 Type *TysForDecl[] = {CI->getType()};
4262 if (VF > 1)
4263 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4264 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4265 } else {
4266 // Use vector version of the library call.
4267 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4268 assert(!VFnName.empty() && "Vector function name is empty.");
4269 VectorF = M->getFunction(VFnName);
4270 if (!VectorF) {
4271 // Generate a declaration
4272 FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4273 VectorF =
4274 Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4275 VectorF->copyAttributesFrom(F);
4278 assert(VectorF && "Can't create vector function.");
4280 SmallVector<OperandBundleDef, 1> OpBundles;
4281 CI->getOperandBundlesAsDefs(OpBundles);
4282 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4284 if (isa<FPMathOperator>(V))
4285 V->copyFastMathFlags(CI);
4287 VectorLoopValueMap.setVectorValue(&I, Part, V);
4288 addMetadata(V, &I);
4291 break;
4294 default:
4295 // This instruction is not vectorized by simple widening.
4296 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4297 llvm_unreachable("Unhandled instruction!");
4298 } // end of switch.
4301 void InnerLoopVectorizer::updateAnalysis() {
4302 // Forget the original basic block.
4303 PSE.getSE()->forgetLoop(OrigLoop);
4305 // DT is not kept up-to-date for outer loop vectorization
4306 if (EnableVPlanNativePath)
4307 return;
4309 // Update the dominator tree information.
4310 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4311 "Entry does not dominate exit.");
4313 DT->addNewBlock(LoopMiddleBlock,
4314 LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4315 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4316 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4317 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4318 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4321 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4322 // We should not collect Scalars more than once per VF. Right now, this
4323 // function is called from collectUniformsAndScalars(), which already does
4324 // this check. Collecting Scalars for VF=1 does not make any sense.
4325 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4326 "This function should not be visited twice for the same VF");
4328 SmallSetVector<Instruction *, 8> Worklist;
4330 // These sets are used to seed the analysis with pointers used by memory
4331 // accesses that will remain scalar.
4332 SmallSetVector<Instruction *, 8> ScalarPtrs;
4333 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4335 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4336 // The pointer operands of loads and stores will be scalar as long as the
4337 // memory access is not a gather or scatter operation. The value operand of a
4338 // store will remain scalar if the store is scalarized.
4339 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4340 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4341 assert(WideningDecision != CM_Unknown &&
4342 "Widening decision should be ready at this moment");
4343 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4344 if (Ptr == Store->getValueOperand())
4345 return WideningDecision == CM_Scalarize;
4346 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4347 "Ptr is neither a value or pointer operand");
4348 return WideningDecision != CM_GatherScatter;
4351 // A helper that returns true if the given value is a bitcast or
4352 // getelementptr instruction contained in the loop.
4353 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4354 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4355 isa<GetElementPtrInst>(V)) &&
4356 !TheLoop->isLoopInvariant(V);
4359 // A helper that evaluates a memory access's use of a pointer. If the use
4360 // will be a scalar use, and the pointer is only used by memory accesses, we
4361 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4362 // PossibleNonScalarPtrs.
4363 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4364 // We only care about bitcast and getelementptr instructions contained in
4365 // the loop.
4366 if (!isLoopVaryingBitCastOrGEP(Ptr))
4367 return;
4369 // If the pointer has already been identified as scalar (e.g., if it was
4370 // also identified as uniform), there's nothing to do.
4371 auto *I = cast<Instruction>(Ptr);
4372 if (Worklist.count(I))
4373 return;
4375 // If the use of the pointer will be a scalar use, and all users of the
4376 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4377 // place the pointer in PossibleNonScalarPtrs.
4378 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4379 return isa<LoadInst>(U) || isa<StoreInst>(U);
4381 ScalarPtrs.insert(I);
4382 else
4383 PossibleNonScalarPtrs.insert(I);
4386 // We seed the scalars analysis with three classes of instructions: (1)
4387 // instructions marked uniform-after-vectorization, (2) bitcast and
4388 // getelementptr instructions used by memory accesses requiring a scalar use,
4389 // and (3) pointer induction variables and their update instructions (we
4390 // currently only scalarize these).
4392 // (1) Add to the worklist all instructions that have been identified as
4393 // uniform-after-vectorization.
4394 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4396 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4397 // memory accesses requiring a scalar use. The pointer operands of loads and
4398 // stores will be scalar as long as the memory access is not a gather or
4399 // scatter operation. The value operand of a store will remain scalar if the
4400 // store is scalarized.
4401 for (auto *BB : TheLoop->blocks())
4402 for (auto &I : *BB) {
4403 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4404 evaluatePtrUse(Load, Load->getPointerOperand());
4405 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4406 evaluatePtrUse(Store, Store->getPointerOperand());
4407 evaluatePtrUse(Store, Store->getValueOperand());
4410 for (auto *I : ScalarPtrs)
4411 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4412 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4413 Worklist.insert(I);
4416 // (3) Add to the worklist all pointer induction variables and their update
4417 // instructions.
4419 // TODO: Once we are able to vectorize pointer induction variables we should
4420 // no longer insert them into the worklist here.
4421 auto *Latch = TheLoop->getLoopLatch();
4422 for (auto &Induction : *Legal->getInductionVars()) {
4423 auto *Ind = Induction.first;
4424 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4425 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4426 continue;
4427 Worklist.insert(Ind);
4428 Worklist.insert(IndUpdate);
4429 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4430 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4431 << "\n");
4434 // Insert the forced scalars.
4435 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4436 // induction variable when the PHI user is scalarized.
4437 auto ForcedScalar = ForcedScalars.find(VF);
4438 if (ForcedScalar != ForcedScalars.end())
4439 for (auto *I : ForcedScalar->second)
4440 Worklist.insert(I);
4442 // Expand the worklist by looking through any bitcasts and getelementptr
4443 // instructions we've already identified as scalar. This is similar to the
4444 // expansion step in collectLoopUniforms(); however, here we're only
4445 // expanding to include additional bitcasts and getelementptr instructions.
4446 unsigned Idx = 0;
4447 while (Idx != Worklist.size()) {
4448 Instruction *Dst = Worklist[Idx++];
4449 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4450 continue;
4451 auto *Src = cast<Instruction>(Dst->getOperand(0));
4452 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4453 auto *J = cast<Instruction>(U);
4454 return !TheLoop->contains(J) || Worklist.count(J) ||
4455 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4456 isScalarUse(J, Src));
4457 })) {
4458 Worklist.insert(Src);
4459 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4463 // An induction variable will remain scalar if all users of the induction
4464 // variable and induction variable update remain scalar.
4465 for (auto &Induction : *Legal->getInductionVars()) {
4466 auto *Ind = Induction.first;
4467 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4469 // We already considered pointer induction variables, so there's no reason
4470 // to look at their users again.
4472 // TODO: Once we are able to vectorize pointer induction variables we
4473 // should no longer skip over them here.
4474 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4475 continue;
4477 // Determine if all users of the induction variable are scalar after
4478 // vectorization.
4479 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4480 auto *I = cast<Instruction>(U);
4481 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4483 if (!ScalarInd)
4484 continue;
4486 // Determine if all users of the induction variable update instruction are
4487 // scalar after vectorization.
4488 auto ScalarIndUpdate =
4489 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4490 auto *I = cast<Instruction>(U);
4491 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4493 if (!ScalarIndUpdate)
4494 continue;
4496 // The induction variable and its update instruction will remain scalar.
4497 Worklist.insert(Ind);
4498 Worklist.insert(IndUpdate);
4499 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4500 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4501 << "\n");
4504 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4507 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4508 if (!blockNeedsPredication(I->getParent()))
4509 return false;
4510 switch(I->getOpcode()) {
4511 default:
4512 break;
4513 case Instruction::Load:
4514 case Instruction::Store: {
4515 if (!Legal->isMaskRequired(I))
4516 return false;
4517 auto *Ptr = getLoadStorePointerOperand(I);
4518 auto *Ty = getMemInstValueType(I);
4519 // We have already decided how to vectorize this instruction, get that
4520 // result.
4521 if (VF > 1) {
4522 InstWidening WideningDecision = getWideningDecision(I, VF);
4523 assert(WideningDecision != CM_Unknown &&
4524 "Widening decision should be ready at this moment");
4525 return WideningDecision == CM_Scalarize;
4527 return isa<LoadInst>(I) ?
4528 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4529 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4531 case Instruction::UDiv:
4532 case Instruction::SDiv:
4533 case Instruction::SRem:
4534 case Instruction::URem:
4535 return mayDivideByZero(*I);
4537 return false;
4540 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4541 unsigned VF) {
4542 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4543 assert(getWideningDecision(I, VF) == CM_Unknown &&
4544 "Decision should not be set yet.");
4545 auto *Group = getInterleavedAccessGroup(I);
4546 assert(Group && "Must have a group.");
4548 // If the instruction's allocated size doesn't equal its type size, it
4549 // requires padding and will be scalarized.
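// One illustrative case: a type such as x86_fp80, whose 80-bit type size is
// smaller than its padded in-memory allocation size on typical targets, is
// treated as irregular here and therefore scalarized.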
4550 auto &DL = I->getModule()->getDataLayout();
4551 auto *ScalarTy = getMemInstValueType(I);
4552 if (hasIrregularType(ScalarTy, DL, VF))
4553 return false;
4555 // Check if masking is required.
4556 // A Group may need masking for one of two reasons: it resides in a block that
4557 // needs predication, or it was decided to use masking to deal with gaps.
4558 bool PredicatedAccessRequiresMasking =
4559 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4560 bool AccessWithGapsRequiresMasking =
4561 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4562 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4563 return true;
4565 // If masked interleaving is required, we expect that the user/target had
4566 // enabled it, because otherwise it either wouldn't have been created or
4567 // it should have been invalidated by the CostModel.
4568 assert(useMaskedInterleavedAccesses(TTI) &&
4569 "Masked interleave-groups for predicated accesses are not enabled.");
4571 auto *Ty = getMemInstValueType(I);
4572 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4573 : TTI.isLegalMaskedStore(Ty);
4576 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4577 unsigned VF) {
4578 // Get and ensure we have a valid memory instruction.
4579 LoadInst *LI = dyn_cast<LoadInst>(I);
4580 StoreInst *SI = dyn_cast<StoreInst>(I);
4581 assert((LI || SI) && "Invalid memory instruction");
4583 auto *Ptr = getLoadStorePointerOperand(I);
4585 // First of all, in order to be widened the pointer must be consecutive.
4586 if (!Legal->isConsecutivePtr(Ptr))
4587 return false;
4589 // If the instruction is a store located in a predicated block, it will be
4590 // scalarized.
4591 if (isScalarWithPredication(I))
4592 return false;
4594 // If the instruction's allocated size doesn't equal its type size, it
4595 // requires padding and will be scalarized.
4596 auto &DL = I->getModule()->getDataLayout();
4597 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4598 if (hasIrregularType(ScalarTy, DL, VF))
4599 return false;
4601 return true;
4604 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4605 // We should not collect Uniforms more than once per VF. Right now,
4606 // this function is called from collectUniformsAndScalars(), which
4607 // already does this check. Collecting Uniforms for VF=1 does not make any
4608 // sense.
4610 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4611 "This function should not be visited twice for the same VF");
4613 // Visit the list of Uniforms. If we do not find any uniform value, we will
4614 // not analyze it again: Uniforms.count(VF) will return 1.
4615 Uniforms[VF].clear();
4617 // We now know that the loop is vectorizable!
4618 // Collect instructions inside the loop that will remain uniform after
4619 // vectorization.
4621 // Global values, params and instructions outside of the current loop are out of
4622 // scope.
4623 auto isOutOfScope = [&](Value *V) -> bool {
4624 Instruction *I = dyn_cast<Instruction>(V);
4625 return (!I || !TheLoop->contains(I));
4628 SetVector<Instruction *> Worklist;
4629 BasicBlock *Latch = TheLoop->getLoopLatch();
4631 // Start with the conditional branch. If the branch condition is an
4632 // instruction contained in the loop that is only used by the branch, it is
4633 // uniform.
4634 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4635 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4636 Worklist.insert(Cmp);
4637 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4640 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4641 // are pointers that are treated like consecutive pointers during
4642 // vectorization. The pointer operands of interleaved accesses are an
4643 // example.
4644 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4646 // Holds pointer operands of instructions that are possibly non-uniform.
4647 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4649 auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4650 InstWidening WideningDecision = getWideningDecision(I, VF);
4651 assert(WideningDecision != CM_Unknown &&
4652 "Widening decision should be ready at this moment");
4654 return (WideningDecision == CM_Widen ||
4655 WideningDecision == CM_Widen_Reverse ||
4656 WideningDecision == CM_Interleave);
4658 // Iterate over the instructions in the loop, and collect all
4659 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4660 // that a consecutive-like pointer operand will be scalarized, we collect it
4661 // in PossibleNonUniformPtrs instead. We use two sets here because a single
4662 // getelementptr instruction can be used by both vectorized and scalarized
4663 // memory instructions. For example, if a loop loads and stores from the same
4664 // location, but the store is conditional, the store will be scalarized, and
4665 // the getelementptr won't remain uniform.
4666 for (auto *BB : TheLoop->blocks())
4667 for (auto &I : *BB) {
4668 // If there's no pointer operand, there's nothing to do.
4669 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4670 if (!Ptr)
4671 continue;
4673 // True if all users of Ptr are memory accesses that have Ptr as their
4674 // pointer operand.
4675 auto UsersAreMemAccesses =
4676 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4677 return getLoadStorePointerOperand(U) == Ptr;
4680 // Ensure the memory instruction will not be scalarized or used by
4681 // gather/scatter, making its pointer operand non-uniform. If the pointer
4682 // operand is used by any instruction other than a memory access, we
4683 // conservatively assume the pointer operand may be non-uniform.
4684 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4685 PossibleNonUniformPtrs.insert(Ptr);
4687 // If the memory instruction will be vectorized and its pointer operand
4688 // is consecutive-like, or part of an interleaved group, the pointer
4689 // operand should remain uniform.
4690 else
4691 ConsecutiveLikePtrs.insert(Ptr);
4694 // Add to the Worklist all consecutive and consecutive-like pointers that
4695 // aren't also identified as possibly non-uniform.
4696 for (auto *V : ConsecutiveLikePtrs)
4697 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4698 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4699 Worklist.insert(V);
4702 // Expand Worklist in topological order: whenever a new instruction
4703 // is added, its users should already be inside Worklist. This ensures that
4704 // a uniform instruction will only be used by uniform instructions.
4705 unsigned idx = 0;
4706 while (idx != Worklist.size()) {
4707 Instruction *I = Worklist[idx++];
4709 for (auto OV : I->operand_values()) {
4710 // isOutOfScope operands cannot be uniform instructions.
4711 if (isOutOfScope(OV))
4712 continue;
4713 // First-order recurrence phis should typically be considered
4714 // non-uniform.
4715 auto *OP = dyn_cast<PHINode>(OV);
4716 if (OP && Legal->isFirstOrderRecurrence(OP))
4717 continue;
4718 // If all the users of the operand are uniform, then add the
4719 // operand into the uniform worklist.
4720 auto *OI = cast<Instruction>(OV);
4721 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4722 auto *J = cast<Instruction>(U);
4723 return Worklist.count(J) ||
4724 (OI == getLoadStorePointerOperand(J) &&
4725 isUniformDecision(J, VF));
4726 })) {
4727 Worklist.insert(OI);
4728 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4733 // Returns true if Ptr is the pointer operand of a memory access instruction
4734 // I, and I is known to not require scalarization.
4735 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4736 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4739 // For an instruction to be added into Worklist above, all its users inside
4740 // the loop should also be in Worklist. However, this condition cannot be
4741 // true for phi nodes that form a cyclic dependence. We must process phi
4742 // nodes separately. An induction variable will remain uniform if all users
4743 // of the induction variable and induction variable update remain uniform.
4744 // The code below handles both pointer and non-pointer induction variables.
4745 for (auto &Induction : *Legal->getInductionVars()) {
4746 auto *Ind = Induction.first;
4747 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4749 // Determine if all users of the induction variable are uniform after
4750 // vectorization.
4751 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4752 auto *I = cast<Instruction>(U);
4753 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4754 isVectorizedMemAccessUse(I, Ind);
4756 if (!UniformInd)
4757 continue;
4759 // Determine if all users of the induction variable update instruction are
4760 // uniform after vectorization.
4761 auto UniformIndUpdate =
4762 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4763 auto *I = cast<Instruction>(U);
4764 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4765 isVectorizedMemAccessUse(I, IndUpdate);
4767 if (!UniformIndUpdate)
4768 continue;
4770 // The induction variable and its update instruction will remain uniform.
4771 Worklist.insert(Ind);
4772 Worklist.insert(IndUpdate);
4773 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4774 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4775 << "\n");
4778 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4781 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4782 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4784 if (Legal->getRuntimePointerChecking()->Need) {
4785 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4786 "runtime pointer checks needed. Enable vectorization of this "
4787 "loop with '#pragma clang loop vectorize(enable)' when "
4788 "compiling with -Os/-Oz",
4789 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4790 return true;
4793 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4794 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4795 "runtime SCEV checks needed. Enable vectorization of this "
4796 "loop with '#pragma clang loop vectorize(enable)' when "
4797 "compiling with -Os/-Oz",
4798 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4799 return true;
4802 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4803 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4804 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4805 "runtime stride == 1 checks needed. Enable vectorization of "
4806 "this loop with '#pragma clang loop vectorize(enable)' when "
4807 "compiling with -Os/-Oz",
4808 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4809 return true;
4812 return false;
4815 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4816 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4817 // TODO: It may be useful to do this since it's still likely to be dynamically
4818 // uniform if the target can skip.
4819 reportVectorizationFailure(
4820 "Not inserting runtime ptr check for divergent target",
4821 "runtime pointer checks needed. Not enabled for divergent target",
4822 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4823 return None;
4826 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4827 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4828 if (TC == 1) {
4829 reportVectorizationFailure("Single iteration (non) loop",
4830 "loop trip count is one, irrelevant for vectorization",
4831 "SingleIterationLoop", ORE, TheLoop);
4832 return None;
4835 switch (ScalarEpilogueStatus) {
4836 case CM_ScalarEpilogueAllowed:
4837 return computeFeasibleMaxVF(TC);
4838 case CM_ScalarEpilogueNotNeededUsePredicate:
4839 LLVM_DEBUG(
4840 dbgs() << "LV: vector predicate hint/switch found.\n"
4841 << "LV: Not allowing scalar epilogue, creating predicated "
4842 << "vector loop.\n");
4843 break;
4844 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4845 // fallthrough as a special case of OptForSize
4846 case CM_ScalarEpilogueNotAllowedOptSize:
4847 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4848 LLVM_DEBUG(
4849 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4850 else
4851 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4852 << "count.\n");
4854 // Bail if runtime checks are required, which are not good when optimising
4855 // for size.
4856 if (runtimeChecksRequired())
4857 return None;
4858 break;
4861 // Now try the tail folding
4863 // Invalidate interleave groups that require an epilogue if we can't mask
4864 // the interleave-group.
4865 if (!useMaskedInterleavedAccesses(TTI))
4866 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4868 unsigned MaxVF = computeFeasibleMaxVF(TC);
4869 if (TC > 0 && TC % MaxVF == 0) {
4870 // Accept MaxVF if we do not have a tail.
4871 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4872 return MaxVF;
4875 // If we don't know the precise trip count, or if the trip count that we
4876 // found modulo the vectorization factor is not zero, try to fold the tail
4877 // by masking.
4878 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4879 if (Legal->prepareToFoldTailByMasking()) {
4880 FoldTailByMasking = true;
4881 return MaxVF;
4884 if (TC == 0) {
4885 reportVectorizationFailure(
4886 "Unable to calculate the loop count due to complex control flow",
4887 "unable to calculate the loop count due to complex control flow",
4888 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4889 return None;
4892 reportVectorizationFailure(
4893 "Cannot optimize for size and vectorize at the same time.",
4894 "cannot optimize for size and vectorize at the same time. "
4895 "Enable vectorization of this loop with '#pragma clang loop "
4896 "vectorize(enable)' when compiling with -Os/-Oz",
4897 "NoTailLoopWithOptForSize", ORE, TheLoop);
4898 return None;
4901 unsigned
4902 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4903 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4904 unsigned SmallestType, WidestType;
4905 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4906 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4908 // Get the maximum safe dependence distance in bits computed by LAA.
4909 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4910 // the memory access that is most restrictive (involved in the smallest
4911 // dependence distance).
4912 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4914 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4916 unsigned MaxVectorSize = WidestRegister / WidestType;
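// For example (illustrative numbers): with 256-bit vector registers and a
// widest loop type of i64 (64 bits), MaxVectorSize = 256 / 64 = 4 lanes.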
4918 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4919 << " / " << WidestType << " bits.\n");
4920 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4921 << WidestRegister << " bits.\n");
4923 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4924 " into one vector!");
4925 if (MaxVectorSize == 0) {
4926 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4927 MaxVectorSize = 1;
4928 return MaxVectorSize;
4929 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4930 isPowerOf2_32(ConstTripCount)) {
4931 // We need to clamp the VF to be the ConstTripCount. There is no point in
4932 // choosing a higher viable VF as done in the loop below.
4933 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4934 << ConstTripCount << "\n");
4935 MaxVectorSize = ConstTripCount;
4936 return MaxVectorSize;
4939 unsigned MaxVF = MaxVectorSize;
4940 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4941 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4942 // Collect all viable vectorization factors larger than the default MaxVF
4943 // (i.e. MaxVectorSize).
4944 SmallVector<unsigned, 8> VFs;
4945 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4946 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4947 VFs.push_back(VS);
4949 // For each VF calculate its register usage.
4950 auto RUs = calculateRegisterUsage(VFs);
4952 // Select the largest VF which doesn't require more registers than existing
4953 // ones.
4954 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4955 for (int i = RUs.size() - 1; i >= 0; --i) {
4956 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4957 MaxVF = VFs[i];
4958 break;
4961 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4962 if (MaxVF < MinVF) {
4963 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4964 << ") with target's minimum: " << MinVF << '\n');
4965 MaxVF = MinVF;
4969 return MaxVF;
4972 VectorizationFactor
4973 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4974 float Cost = expectedCost(1).first;
4975 const float ScalarCost = Cost;
4976 unsigned Width = 1;
4977 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4979 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4980 if (ForceVectorization && MaxVF > 1) {
4981 // Ignore scalar width, because the user explicitly wants vectorization.
4982 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4983 // evaluation.
4984 Cost = std::numeric_limits<float>::max();
4987 for (unsigned i = 2; i <= MaxVF; i *= 2) {
4988 // Notice that the vector loop needs to be executed fewer times, so
4989 // we need to divide the cost of the vector loop by the width of
4990 // the vector elements.
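// For example (illustrative costs): if the scalar loop costs 8 per iteration
// and the VF=4 vector loop costs 20 per vector iteration, the normalized
// vector cost is 20 / 4 = 5 per scalar iteration, so VF=4 is preferred.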
4991 VectorizationCostTy C = expectedCost(i);
4992 float VectorCost = C.first / (float)i;
4993 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4994 << " costs: " << (int)VectorCost << ".\n");
4995 if (!C.second && !ForceVectorization) {
4996 LLVM_DEBUG(
4997 dbgs() << "LV: Not considering vector loop of width " << i
4998 << " because it will not generate any vector instructions.\n");
4999 continue;
5001 if (VectorCost < Cost) {
5002 Cost = VectorCost;
5003 Width = i;
5007 if (!EnableCondStoresVectorization && NumPredStores) {
5008 reportVectorizationFailure("There are conditional stores.",
5009 "store that is conditionally executed prevents vectorization",
5010 "ConditionalStore", ORE, TheLoop);
5011 Width = 1;
5012 Cost = ScalarCost;
5015 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5016 << "LV: Vectorization seems to be not beneficial, "
5017 << "but was forced by a user.\n");
5018 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5019 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5020 return Factor;
5023 std::pair<unsigned, unsigned>
5024 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5025 unsigned MinWidth = -1U;
5026 unsigned MaxWidth = 8;
5027 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5029 // For each block.
5030 for (BasicBlock *BB : TheLoop->blocks()) {
5031 // For each instruction in the loop.
5032 for (Instruction &I : BB->instructionsWithoutDebug()) {
5033 Type *T = I.getType();
5035 // Skip ignored values.
5036 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5037 continue;
5039 // Only examine Loads, Stores and PHINodes.
5040 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5041 continue;
5043 // Examine PHI nodes that are reduction variables. Update the type to
5044 // account for the recurrence type.
5045 if (auto *PN = dyn_cast<PHINode>(&I)) {
5046 if (!Legal->isReductionVariable(PN))
5047 continue;
5048 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5049 T = RdxDesc.getRecurrenceType();
5052 // Examine the stored values.
5053 if (auto *ST = dyn_cast<StoreInst>(&I))
5054 T = ST->getValueOperand()->getType();
5056 // Ignore loaded pointer types and stored pointer types that are not
5057 // vectorizable.
5059 // FIXME: The check here attempts to predict whether a load or store will
5060 // be vectorized. We only know this for certain after a VF has
5061 // been selected. Here, we assume that if an access can be
5062 // vectorized, it will be. We should also look at extending this
5063 // optimization to non-pointer types.
5065 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5066 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5067 continue;
5069 MinWidth = std::min(MinWidth,
5070 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5071 MaxWidth = std::max(MaxWidth,
5072 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5076 return {MinWidth, MaxWidth};
5079 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5080 unsigned LoopCost) {
5081 // -- The interleave heuristics --
5082 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5083 // There are many micro-architectural considerations that we can't predict
5084 // at this level. For example, frontend pressure (on decode or fetch) due to
5085 // code size, or the number and capabilities of the execution ports.
5087 // We use the following heuristics to select the interleave count:
5088 // 1. If the code has reductions, then we interleave to break the cross
5089 // iteration dependency.
5090 // 2. If the loop is really small, then we interleave to reduce the loop
5091 // overhead.
5092 // 3. We don't interleave if we think that we will spill registers to memory
5093 // due to the increased register pressure.
5095 if (!isScalarEpilogueAllowed())
5096 return 1;
5098 // We used the distance for the interleave count.
5099 if (Legal->getMaxSafeDepDistBytes() != -1U)
5100 return 1;
5102 // Do not interleave loops with a relatively small trip count.
5103 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5104 if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5105 return 1;
5107 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5108 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5109 << " registers\n");
5111 if (VF == 1) {
5112 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5113 TargetNumRegisters = ForceTargetNumScalarRegs;
5114 } else {
5115 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5116 TargetNumRegisters = ForceTargetNumVectorRegs;
5119 RegisterUsage R = calculateRegisterUsage({VF})[0];
5120 // We divide by these constants so assume that we have at least one
5121 // instruction that uses at least one register.
5122 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5124 // We calculate the interleave count using the following formula.
5125 // Subtract the number of loop invariants from the number of available
5126 // registers. These registers are used by all of the interleaved instances.
5127 // Next, divide the remaining registers by the number of registers that is
5128 // required by the loop, in order to estimate how many parallel instances
5129 // fit without causing spills. All of this is rounded down if necessary to be
5130 // a power of two. We want a power-of-two interleave count to simplify any
5131 // addressing operations or alignment considerations.
5132 // We also want power-of-two interleave counts to ensure that the induction
5133 // variable of the vector loop wraps to zero, when tail is folded by masking;
5134 // this currently happens when OptForSize, in which case IC is set to 1 above.
5135 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5136 R.MaxLocalUsers);
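// For example (illustrative numbers): with TargetNumRegisters = 16,
// R.LoopInvariantRegs = 2 and R.MaxLocalUsers = 3, the initial estimate is
// PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4 interleaved copies.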
5138 // Don't count the induction variable as interleaved.
5139 if (EnableIndVarRegisterHeur)
5140 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5141 std::max(1U, (R.MaxLocalUsers - 1)));
5143 // Clamp the interleave ranges to reasonable counts.
5144 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5146 // Check if the user has overridden the max.
5147 if (VF == 1) {
5148 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5149 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5150 } else {
5151 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5152 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5155 // If the trip count is constant, limit the interleave count to be less than
5156 // the trip count divided by VF.
5157 if (TC > 0) {
5158 assert(TC >= VF && "VF exceeds trip count?");
5159 if ((TC / VF) < MaxInterleaveCount)
5160 MaxInterleaveCount = (TC / VF);
5163 // If we did not calculate the cost for VF (because the user selected the VF)
5164 // then we calculate the cost of VF here.
5165 if (LoopCost == 0)
5166 LoopCost = expectedCost(VF).first;
5168 assert(LoopCost && "Non-zero loop cost expected");
5170 // Clamp the calculated IC to be between the 1 and the max interleave count
5171 // that the target and trip count allows.
5172 if (IC > MaxInterleaveCount)
5173 IC = MaxInterleaveCount;
5174 else if (IC < 1)
5175 IC = 1;
5177 // Interleave if we vectorized this loop and there is a reduction that could
5178 // benefit from interleaving.
5179 if (VF > 1 && !Legal->getReductionVars()->empty()) {
5180 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5181 return IC;
5184 // Note that if we've already vectorized the loop we will have done the
5185 // runtime check and so interleaving won't require further checks.
5186 bool InterleavingRequiresRuntimePointerCheck =
5187 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5189 // We want to interleave small loops in order to reduce the loop overhead and
5190 // potentially expose ILP opportunities.
5191 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5192 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5193 // We assume that the cost overhead is 1 and we use the cost model
5194 // to estimate the cost of the loop and interleave until the cost of the
5195 // loop overhead is about 5% of the cost of the loop.
5196 unsigned SmallIC =
5197 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
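// For example (illustrative numbers): with SmallLoopCost = 20 and a loop of
// cost 6, PowerOf2Floor(20 / 6) = PowerOf2Floor(3) = 2, so at most two copies
// are interleaved purely to amortize the loop overhead.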
5199 // Interleave until store/load ports (estimated by max interleave count) are
5200 // saturated.
5201 unsigned NumStores = Legal->getNumStores();
5202 unsigned NumLoads = Legal->getNumLoads();
5203 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5204 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5206 // If we have a scalar reduction (vector reductions are already dealt with
5207 // by this point), we can increase the critical path length if the loop
5208 // we're interleaving is inside another loop. Limit it, by default, to 2, so the
5209 // critical path only gets increased by one reduction operation.
5210 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5211 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5212 SmallIC = std::min(SmallIC, F);
5213 StoresIC = std::min(StoresIC, F);
5214 LoadsIC = std::min(LoadsIC, F);
5217 if (EnableLoadStoreRuntimeInterleave &&
5218 std::max(StoresIC, LoadsIC) > SmallIC) {
5219 LLVM_DEBUG(
5220 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5221 return std::max(StoresIC, LoadsIC);
5224 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5225 return SmallIC;
5228 // Interleave if this is a large loop (small loops are already dealt with by
5229 // this point) that could benefit from interleaving.
5230 bool HasReductions = !Legal->getReductionVars()->empty();
5231 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5232 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5233 return IC;
5236 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5237 return 1;
5240 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5241 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5242 // This function calculates the register usage by measuring the highest number
5243 // of values that are alive at a single location. Obviously, this is a very
5244 // rough estimation. We scan the loop in topological order and
5245 // assign a number to each instruction. We use RPO to ensure that defs are
5246 // met before their users. We assume that each instruction that has in-loop
5247 // users starts an interval. We record every time that an in-loop value is
5248 // used, so we have a list of the first and last occurrences of each
5249 // instruction. Next, we transpose this data structure into a multi map that
5250 // holds the list of intervals that *end* at a specific location. This multi
5251 // map allows us to perform a linear search. We scan the instructions linearly
5252 // and record each time that a new interval starts, by placing it in a set.
5253 // If we find this value in the multi-map then we remove it from the set.
5254 // The max register usage is the maximum size of the set.
5255 // We also search for instructions that are defined outside the loop, but are
5256 // used inside the loop. We need this number separately from the max-interval
5257 // usage number because when we unroll, loop-invariant values do not take
5258 // more registers.
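// For example (illustrative): for a chain "a = load; b = a + 1; store b",
// the interval for 'a' opens at its definition and closes at its last use in
// 'b', and 'b' closes at the store, so only a couple of intervals are ever
// open at once and the estimated register usage stays correspondingly small.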
5259 LoopBlocksDFS DFS(TheLoop);
5260 DFS.perform(LI);
5262 RegisterUsage RU;
5264 // Each 'key' in the map opens a new interval. The values
5265 // of the map are the index of the 'last seen' usage of the
5266 // instruction that is the key.
5267 using IntervalMap = DenseMap<Instruction *, unsigned>;
5269 // Maps instruction to its index.
5270 SmallVector<Instruction *, 64> IdxToInstr;
5271 // Marks the end of each interval.
5272 IntervalMap EndPoint;
5273 // Saves the list of instruction indices that are used in the loop.
5274 SmallPtrSet<Instruction *, 8> Ends;
5275 // Saves the list of values that are used in the loop but are
5276 // defined outside the loop, such as arguments and constants.
5277 SmallPtrSet<Value *, 8> LoopInvariants;
5279 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5280 for (Instruction &I : BB->instructionsWithoutDebug()) {
5281 IdxToInstr.push_back(&I);
5283 // Save the end location of each USE.
5284 for (Value *U : I.operands()) {
5285 auto *Instr = dyn_cast<Instruction>(U);
5287 // Ignore non-instruction values such as arguments, constants, etc.
5288 if (!Instr)
5289 continue;
5291 // If this instruction is outside the loop then record it and continue.
5292 if (!TheLoop->contains(Instr)) {
5293 LoopInvariants.insert(Instr);
5294 continue;
5297 // Overwrite previous end points.
5298 EndPoint[Instr] = IdxToInstr.size();
5299 Ends.insert(Instr);
5304 // Saves the list of intervals that end with the index in 'key'.
5305 using InstrList = SmallVector<Instruction *, 2>;
5306 DenseMap<unsigned, InstrList> TransposeEnds;
5308 // Transpose the EndPoints to a list of values that end at each index.
5309 for (auto &Interval : EndPoint)
5310 TransposeEnds[Interval.second].push_back(Interval.first);
5312 SmallPtrSet<Instruction *, 8> OpenIntervals;
5314 // Get the size of the widest register.
5315 unsigned MaxSafeDepDist = -1U;
5316 if (Legal->getMaxSafeDepDistBytes() != -1U)
5317 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5318 unsigned WidestRegister =
5319 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5320 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5322 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5323 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5325 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5327 // A lambda that gets the register usage for the given type and VF.
5328 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5329 if (Ty->isTokenTy())
5330 return 0U;
5331 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5332 return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
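// For example (illustrative numbers): with a 256-bit widest register, an i32
// value at VF = 16 needs max(1, 16 * 32 / 256) = 2 registers, while a value
// whose vector form fits in less than one full register still counts as 1.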
5335 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5336 Instruction *I = IdxToInstr[i];
5338 // Remove all of the instructions that end at this location.
5339 InstrList &List = TransposeEnds[i];
5340 for (Instruction *ToRemove : List)
5341 OpenIntervals.erase(ToRemove);
5343 // Ignore instructions that are never used within the loop.
5344 if (Ends.find(I) == Ends.end())
5345 continue;
5347 // Skip ignored values.
5348 if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5349 continue;
5351 // For each VF find the maximum usage of registers.
5352 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5353 if (VFs[j] == 1) {
5354 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5355 continue;
5357 collectUniformsAndScalars(VFs[j]);
5358 // Count the number of live intervals.
5359 unsigned RegUsage = 0;
5360 for (auto Inst : OpenIntervals) {
5361 // Skip ignored values for VF > 1.
5362 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5363 isScalarAfterVectorization(Inst, VFs[j]))
5364 continue;
5365 RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5367 MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5370 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5371 << OpenIntervals.size() << '\n');
5373 // Add the current instruction to the list of open intervals.
5374 OpenIntervals.insert(I);
5377 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5378 unsigned Invariant = 0;
5379 if (VFs[i] == 1)
5380 Invariant = LoopInvariants.size();
5381 else {
5382 for (auto Inst : LoopInvariants)
5383 Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5386 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5387 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5388 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5389 << '\n');
5391 RU.LoopInvariantRegs = Invariant;
5392 RU.MaxLocalUsers = MaxUsages[i];
5393 RUs[i] = RU;
5396 return RUs;
5399 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5400 // TODO: Cost model for emulated masked load/store is completely
5401 // broken. This hack guides the cost model to use an artificially
5402 // high enough value to practically disable vectorization with such
5403 // operations, except where the previously deployed legality hack allowed
5404 // using very low cost values. This is to avoid regressions coming simply
5405 // from moving the "masked load/store" check from legality to the cost model.
5406 // Masked Load/Gather emulation was previously never allowed.
5407 // Only a limited amount of Masked Store/Scatter emulation was allowed.
5408 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5409 return isa<LoadInst>(I) ||
5410 (isa<StoreInst>(I) &&
5411 NumPredStores > NumberOfStoresToPredicate);
5414 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5415 // If we aren't vectorizing the loop, or if we've already collected the
5416 // instructions to scalarize, there's nothing to do. Collection may already
5417 // have occurred if we have a user-selected VF and are now computing the
5418 // expected cost for interleaving.
5419 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5420 return;
5422 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5423 // not profitable to scalarize any instructions, the presence of VF in the
5424 // map will indicate that we've analyzed it already.
5425 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5427 // Find all the instructions that are scalar with predication in the loop and
5428 // determine if it would be better to not if-convert the blocks they are in.
5429 // If so, we also record the instructions to scalarize.
5430 for (BasicBlock *BB : TheLoop->blocks()) {
5431 if (!blockNeedsPredication(BB))
5432 continue;
5433 for (Instruction &I : *BB)
5434 if (isScalarWithPredication(&I)) {
5435 ScalarCostsTy ScalarCosts;
5436 // Do not apply discount logic if hacked cost is needed
5437 // for emulated masked memrefs.
5438 if (!useEmulatedMaskMemRefHack(&I) &&
5439 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5440 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5441 // Remember that BB will remain after vectorization.
5442 PredicatedBBsAfterVectorization.insert(BB);
5447 int LoopVectorizationCostModel::computePredInstDiscount(
5448 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5449 unsigned VF) {
5450 assert(!isUniformAfterVectorization(PredInst, VF) &&
5451 "Instruction marked uniform-after-vectorization will be predicated");
5453 // Initialize the discount to zero, meaning that the scalar version and the
5454 // vector version cost the same.
5455 int Discount = 0;
5457 // Holds instructions to analyze. The instructions we visit are mapped in
5458 // ScalarCosts. Those instructions are the ones that would be scalarized if
5459 // we find that the scalar version costs less.
5460 SmallVector<Instruction *, 8> Worklist;
5462 // Returns true if the given instruction can be scalarized.
5463 auto canBeScalarized = [&](Instruction *I) -> bool {
5464 // We only attempt to scalarize instructions forming a single-use chain
5465 // from the original predicated block that would otherwise be vectorized.
5466 // Although not strictly necessary, we give up on instructions we know will
5467 // already be scalar to avoid traversing chains that are unlikely to be
5468 // beneficial.
5469 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5470 isScalarAfterVectorization(I, VF))
5471 return false;
5473 // If the instruction is scalar with predication, it will be analyzed
5474 // separately. We ignore it within the context of PredInst.
5475 if (isScalarWithPredication(I))
5476 return false;
5478 // If any of the instruction's operands are uniform after vectorization,
5479 // the instruction cannot be scalarized. This prevents, for example, a
5480 // masked load from being scalarized.
5482 // We assume we will only emit a value for lane zero of an instruction
5483 // marked uniform after vectorization, rather than VF identical values.
5484 // Thus, if we scalarize an instruction that uses a uniform, we would
5485 // create uses of values corresponding to the lanes we aren't emitting code
5486 // for. This behavior can be changed by allowing getScalarValue to clone
5487 // the lane zero values for uniforms rather than asserting.
5488 for (Use &U : I->operands())
5489 if (auto *J = dyn_cast<Instruction>(U.get()))
5490 if (isUniformAfterVectorization(J, VF))
5491 return false;
5493 // Otherwise, we can scalarize the instruction.
5494 return true;
5497 // Compute the expected cost discount from scalarizing the entire expression
5498 // feeding the predicated instruction. We currently only consider expressions
5499 // that are single-use instruction chains.
5500 Worklist.push_back(PredInst);
5501 while (!Worklist.empty()) {
5502 Instruction *I = Worklist.pop_back_val();
5504 // If we've already analyzed the instruction, there's nothing to do.
5505 if (ScalarCosts.find(I) != ScalarCosts.end())
5506 continue;
5508 // Compute the cost of the vector instruction. Note that this cost already
5509 // includes the scalarization overhead of the predicated instruction.
5510 unsigned VectorCost = getInstructionCost(I, VF).first;
5512 // Compute the cost of the scalarized instruction. This cost is the cost of
5513 // the instruction as if it wasn't if-converted and instead remained in the
5514 // predicated block. We will scale this cost by block probability after
5515 // computing the scalarization overhead.
5516 unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5518 // Compute the scalarization overhead of needed insertelement instructions
5519 // and phi nodes.
5520 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5521 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5522 true, false);
5523 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5526 // Compute the scalarization overhead of needed extractelement
5527 // instructions. For each of the instruction's operands, if the operand can
5528 // be scalarized, add it to the worklist; otherwise, account for the
5529 // overhead.
5530 for (Use &U : I->operands())
5531 if (auto *J = dyn_cast<Instruction>(U.get())) {
5532 assert(VectorType::isValidElementType(J->getType()) &&
5533 "Instruction has non-scalar type");
5534 if (canBeScalarized(J))
5535 Worklist.push_back(J);
5536 else if (needsExtract(J, VF))
5537 ScalarCost += TTI.getScalarizationOverhead(
5538 ToVectorTy(J->getType(), VF), false, true);
5541 // Scale the total scalar cost by block probability.
5542 ScalarCost /= getReciprocalPredBlockProb();
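// For example, assuming getReciprocalPredBlockProb() models the usual 50%
// chance of executing the predicated block (i.e. returns 2): with VF = 4 and
// a per-lane scalar cost of 2, ScalarCost = 4 * 2 = 8 before scaling and 4
// afterwards, since the block only runs for about half of the iterations.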
5544 // Compute the discount. A non-negative discount means the vector version
5545 // of the instruction costs more, and scalarizing would be beneficial.
5546 Discount += VectorCost - ScalarCost;
5547 ScalarCosts[I] = ScalarCost;
5550 return Discount;
5553 LoopVectorizationCostModel::VectorizationCostTy
5554 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5555 VectorizationCostTy Cost;
5557 // For each block.
5558 for (BasicBlock *BB : TheLoop->blocks()) {
5559 VectorizationCostTy BlockCost;
5561 // For each instruction in the old loop.
5562 for (Instruction &I : BB->instructionsWithoutDebug()) {
5563 // Skip ignored values.
5564 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5565 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5566 continue;
5568 VectorizationCostTy C = getInstructionCost(&I, VF);
5570 // Check if we should override the cost.
5571 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5572 C.first = ForceTargetInstructionCost;
5574 BlockCost.first += C.first;
5575 BlockCost.second |= C.second;
5576 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5577 << " for VF " << VF << " For instruction: " << I
5578 << '\n');
5581 // If we are vectorizing a predicated block, it will have been
5582 // if-converted. This means that the block's instructions (aside from
5583 // stores and instructions that may divide by zero) will now be
5584 // unconditionally executed. For the scalar case, we may not always execute
5585 // the predicated block. Thus, scale the block's cost by the probability of
5586 // executing it.
5587 if (VF == 1 && blockNeedsPredication(BB))
5588 BlockCost.first /= getReciprocalPredBlockProb();
5590 Cost.first += BlockCost.first;
5591 Cost.second |= BlockCost.second;
5594 return Cost;
5597 /// Gets the address access SCEV after verifying that the access pattern
5598 /// is loop-invariant except for the induction variable dependence.
5600 /// This SCEV can be sent to the Target in order to estimate the address
5601 /// calculation cost.
5602 static const SCEV *getAddressAccessSCEV(
5603 Value *Ptr,
5604 LoopVectorizationLegality *Legal,
5605 PredicatedScalarEvolution &PSE,
5606 const Loop *TheLoop) {
5608 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5609 if (!Gep)
5610 return nullptr;
5612 // We are looking for a gep with all loop invariant indices except for one
5613 // which should be an induction variable.
5614 auto SE = PSE.getSE();
5615 unsigned NumOperands = Gep->getNumOperands();
5616 for (unsigned i = 1; i < NumOperands; ++i) {
5617 Value *Opd = Gep->getOperand(i);
5618 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5619 !Legal->isInductionVariable(Opd))
5620 return nullptr;
5623 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5624 return PSE.getSCEV(Ptr);
5627 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5628 return Legal->hasStride(I->getOperand(0)) ||
5629 Legal->hasStride(I->getOperand(1));
5632 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5633 unsigned VF) {
5634 assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5635 Type *ValTy = getMemInstValueType(I);
5636 auto SE = PSE.getSE();
5638 unsigned Alignment = getLoadStoreAlignment(I);
5639 unsigned AS = getLoadStoreAddressSpace(I);
5640 Value *Ptr = getLoadStorePointerOperand(I);
5641 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5643 // Figure out whether the access is strided and get the stride value
5644 // if it is known at compile time.
5645 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5647 // Get the cost of the scalar memory instruction and address computation.
5648 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5650 // Don't pass *I here, since it is scalar but will actually be part of a
5651 // vectorized loop where the user of it is a vectorized instruction.
5652 Cost += VF *
5653 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5654 AS);
5656 // Get the overhead of the extractelement and insertelement instructions
5657 // we might create due to scalarization.
5658 Cost += getScalarizationOverhead(I, VF);
5660 // If we have a predicated store, it may not be executed for each vector
5661 // lane. Scale the cost by the probability of executing the predicated
5662 // block.
5663 if (isPredicatedInst(I)) {
5664 Cost /= getReciprocalPredBlockProb();
5666 if (useEmulatedMaskMemRefHack(I))
5667 // Artificially setting to a high enough value to practically disable
5668 // vectorization with such operations.
5669 Cost = 3000000;
5672 return Cost;
5675 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5676 unsigned VF) {
5677 Type *ValTy = getMemInstValueType(I);
5678 Type *VectorTy = ToVectorTy(ValTy, VF);
5679 unsigned Alignment = getLoadStoreAlignment(I);
5680 Value *Ptr = getLoadStorePointerOperand(I);
5681 unsigned AS = getLoadStoreAddressSpace(I);
5682 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5684 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5685 "Stride should be 1 or -1 for consecutive memory access");
5686 unsigned Cost = 0;
5687 if (Legal->isMaskRequired(I))
5688 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5689 else
5690 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5692 bool Reverse = ConsecutiveStride < 0;
5693 if (Reverse)
5694 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5695 return Cost;
5698 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5699 unsigned VF) {
5700 Type *ValTy = getMemInstValueType(I);
5701 Type *VectorTy = ToVectorTy(ValTy, VF);
5702 unsigned Alignment = getLoadStoreAlignment(I);
5703 unsigned AS = getLoadStoreAddressSpace(I);
5704 if (isa<LoadInst>(I)) {
5705 return TTI.getAddressComputationCost(ValTy) +
5706 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5707 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5709 StoreInst *SI = cast<StoreInst>(I);
5711 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
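// If the stored value varies per iteration, the vector loop must extract the
// last lane (index VF - 1) of the value vector before issuing the scalar
// store; a loop-invariant value needs no extract at all, as costed below.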
5712 return TTI.getAddressComputationCost(ValTy) +
5713 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5714 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5715 Instruction::ExtractElement,
5716 VectorTy, VF - 1));
5719 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5720 unsigned VF) {
5721 Type *ValTy = getMemInstValueType(I);
5722 Type *VectorTy = ToVectorTy(ValTy, VF);
5723 unsigned Alignment = getLoadStoreAlignment(I);
5724 Value *Ptr = getLoadStorePointerOperand(I);
5726 return TTI.getAddressComputationCost(VectorTy) +
5727 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5728 Legal->isMaskRequired(I), Alignment);
5731 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5732 unsigned VF) {
5733 Type *ValTy = getMemInstValueType(I);
5734 Type *VectorTy = ToVectorTy(ValTy, VF);
5735 unsigned AS = getLoadStoreAddressSpace(I);
5737 auto Group = getInterleavedAccessGroup(I);
5738 assert(Group && "Failed to get an interleaved access group.");
5740 unsigned InterleaveFactor = Group->getFactor();
5741 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
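// For example, an interleave group of factor 2 over i32 with VF = 4 is
// costed as one wide <8 x i32> access plus whatever shuffle overhead the
// target reports through getInterleavedMemoryOpCost() below.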
5743 // Holds the indices of existing members in an interleaved load group.
5744 // An interleaved store group doesn't need this as it doesn't allow gaps.
5745 SmallVector<unsigned, 4> Indices;
5746 if (isa<LoadInst>(I)) {
5747 for (unsigned i = 0; i < InterleaveFactor; i++)
5748 if (Group->getMember(i))
5749 Indices.push_back(i);
5752 // Calculate the cost of the whole interleaved group.
5753 bool UseMaskForGaps =
5754 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5755 unsigned Cost = TTI.getInterleavedMemoryOpCost(
5756 I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5757 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5759 if (Group->isReverse()) {
5760 // TODO: Add support for reversed masked interleaved access.
5761 assert(!Legal->isMaskRequired(I) &&
5762 "Reverse masked interleaved access not supported.");
5763 Cost += Group->getNumMembers() *
5764 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5766 return Cost;
5769 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5770 unsigned VF) {
5771 // Calculate the scalar cost only. The vectorization cost should already be
5772 // available at this point.
5773 if (VF == 1) {
5774 Type *ValTy = getMemInstValueType(I);
5775 unsigned Alignment = getLoadStoreAlignment(I);
5776 unsigned AS = getLoadStoreAddressSpace(I);
5778 return TTI.getAddressComputationCost(ValTy) +
5779 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5781 return getWideningCost(I, VF);
5784 LoopVectorizationCostModel::VectorizationCostTy
5785 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5786 // If we know that this instruction will remain uniform, check the cost of
5787 // the scalar version.
5788 if (isUniformAfterVectorization(I, VF))
5789 VF = 1;
5791 if (VF > 1 && isProfitableToScalarize(I, VF))
5792 return VectorizationCostTy(InstsToScalarize[VF][I], false);
5794 // Forced scalars do not have any scalarization overhead.
5795 auto ForcedScalar = ForcedScalars.find(VF);
5796 if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5797 auto InstSet = ForcedScalar->second;
5798 if (InstSet.find(I) != InstSet.end())
5799 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5802 Type *VectorTy;
5803 unsigned C = getInstructionCost(I, VF, VectorTy);
5805 bool TypeNotScalarized =
5806 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
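// TypeNotScalarized is true when the target legalizes VectorTy into fewer
// than VF parts, i.e. the instruction is genuinely vectorized rather than
// effectively scalarized lane by lane; it is returned as the second member
// of VectorizationCostTy.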
5807 return VectorizationCostTy(C, TypeNotScalarized);
5810 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5811 unsigned VF) {
5813 if (VF == 1)
5814 return 0;
5816 unsigned Cost = 0;
5817 Type *RetTy = ToVectorTy(I->getType(), VF);
5818 if (!RetTy->isVoidTy() &&
5819 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5820 Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5822 // Some targets keep addresses scalar.
5823 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5824 return Cost;
5826 // Some targets support efficient element stores.
5827 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5828 return Cost;
5830 // Collect operands to consider.
5831 CallInst *CI = dyn_cast<CallInst>(I);
5832 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5834 // Skip operands that do not require extraction/scalarization and do not incur
5835 // any overhead.
5836 return Cost + TTI.getOperandsScalarizationOverhead(
5837 filterExtractingOperands(Ops, VF), VF);
5840 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5841 if (VF == 1)
5842 return;
5843 NumPredStores = 0;
5844 for (BasicBlock *BB : TheLoop->blocks()) {
5845 // For each instruction in the old loop.
5846 for (Instruction &I : *BB) {
5847 Value *Ptr = getLoadStorePointerOperand(&I);
5848 if (!Ptr)
5849 continue;
5851 // TODO: We should generate better code and update the cost model for
5852 // predicated uniform stores. Today they are treated as any other
5853 // predicated store (see added test cases in
5854 // invariant-store-vectorization.ll).
5855 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5856 NumPredStores++;
5858 if (Legal->isUniform(Ptr) &&
5859 // Conditional loads and stores should be scalarized and predicated.
5860 // isScalarWithPredication cannot be used here since masked
5861 // gather/scatters are not considered scalar with predication.
5862 !Legal->blockNeedsPredication(I.getParent())) {
5863 // TODO: Avoid replicating loads and stores instead of
5864 // relying on instcombine to remove them.
5865 // Load: Scalar load + broadcast
5866 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5867 unsigned Cost = getUniformMemOpCost(&I, VF);
5868 setWideningDecision(&I, VF, CM_Scalarize, Cost);
5869 continue;
5872 // We assume that widening is the best solution when possible.
5873 if (memoryInstructionCanBeWidened(&I, VF)) {
5874 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5875 int ConsecutiveStride =
5876 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5877 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5878 "Expected consecutive stride.");
5879 InstWidening Decision =
5880 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5881 setWideningDecision(&I, VF, Decision, Cost);
5882 continue;
5885 // Choose between Interleaving, Gather/Scatter or Scalarization.
5886 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5887 unsigned NumAccesses = 1;
5888 if (isAccessInterleaved(&I)) {
5889 auto Group = getInterleavedAccessGroup(&I);
5890 assert(Group && "Failed to get an interleaved access group.");
5892 // Make one decision for the whole group.
5893 if (getWideningDecision(&I, VF) != CM_Unknown)
5894 continue;
5896 NumAccesses = Group->getNumMembers();
5897 if (interleavedAccessCanBeWidened(&I, VF))
5898 InterleaveCost = getInterleaveGroupCost(&I, VF);
5901 unsigned GatherScatterCost =
5902 isLegalGatherOrScatter(&I)
5903 ? getGatherScatterCost(&I, VF) * NumAccesses
5904 : std::numeric_limits<unsigned>::max();
5906 unsigned ScalarizationCost =
5907 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5909 // Choose better solution for the current VF,
5910 // write down this decision and use it during vectorization.
5911 unsigned Cost;
5912 InstWidening Decision;
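// Tie-breaking below: interleaving wins a tie against gather/scatter (<=)
// but must be strictly cheaper than scalarization (<); gather/scatter in
// turn is chosen only when strictly cheaper than scalarization.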
5913 if (InterleaveCost <= GatherScatterCost &&
5914 InterleaveCost < ScalarizationCost) {
5915 Decision = CM_Interleave;
5916 Cost = InterleaveCost;
5917 } else if (GatherScatterCost < ScalarizationCost) {
5918 Decision = CM_GatherScatter;
5919 Cost = GatherScatterCost;
5920 } else {
5921 Decision = CM_Scalarize;
5922 Cost = ScalarizationCost;
5924 // If the instruction belongs to an interleave group, the whole group
5925 // receives the same decision. The cost is computed for the whole group,
5926 // but it will actually be assigned to a single instruction.
5927 if (auto Group = getInterleavedAccessGroup(&I))
5928 setWideningDecision(Group, VF, Decision, Cost);
5929 else
5930 setWideningDecision(&I, VF, Decision, Cost);
5934 // Make sure that any load of an address and any other address computation
5935 // remain scalar unless there is gather/scatter support. This avoids
5936 // inevitable extracts into address registers, and also has the benefit of
5937 // activating LSR more, since that pass can't optimize vectorized
5938 // addresses.
5939 if (TTI.prefersVectorizedAddressing())
5940 return;
5942 // Start with all scalar pointer uses.
5943 SmallPtrSet<Instruction *, 8> AddrDefs;
5944 for (BasicBlock *BB : TheLoop->blocks())
5945 for (Instruction &I : *BB) {
5946 Instruction *PtrDef =
5947 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5948 if (PtrDef && TheLoop->contains(PtrDef) &&
5949 getWideningDecision(&I, VF) != CM_GatherScatter)
5950 AddrDefs.insert(PtrDef);
5953 // Add all instructions used to generate the addresses.
5954 SmallVector<Instruction *, 4> Worklist;
5955 for (auto *I : AddrDefs)
5956 Worklist.push_back(I);
5957 while (!Worklist.empty()) {
5958 Instruction *I = Worklist.pop_back_val();
5959 for (auto &Op : I->operands())
5960 if (auto *InstOp = dyn_cast<Instruction>(Op))
5961 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5962 AddrDefs.insert(InstOp).second)
5963 Worklist.push_back(InstOp);
5966 for (auto *I : AddrDefs) {
5967 if (isa<LoadInst>(I)) {
5968 // Setting the desired widening decision should ideally be handled by
5969 // the cost functions, but since this involves finding out whether the
5970 // loaded register is involved in an address computation, it is
5971 // instead changed here when we know this is the case.
5972 InstWidening Decision = getWideningDecision(I, VF);
5973 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5974 // Scalarize a widened load of address.
5975 setWideningDecision(I, VF, CM_Scalarize,
5976 (VF * getMemoryInstructionCost(I, 1)));
5977 else if (auto Group = getInterleavedAccessGroup(I)) {
5978 // Scalarize an interleave group of address loads.
5979 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5980 if (Instruction *Member = Group->getMember(I))
5981 setWideningDecision(Member, VF, CM_Scalarize,
5982 (VF * getMemoryInstructionCost(Member, 1)));
5985 } else
5986 // Make sure I gets scalarized and is given a cost estimate without
5987 // scalarization overhead.
5988 ForcedScalars[VF].insert(I);
5992 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5993 unsigned VF,
5994 Type *&VectorTy) {
5995 Type *RetTy = I->getType();
5996 if (canTruncateToMinimalBitwidth(I, VF))
5997 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5998 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5999 auto SE = PSE.getSE();
6001 // TODO: We need to estimate the cost of intrinsic calls.
6002 switch (I->getOpcode()) {
6003 case Instruction::GetElementPtr:
6004 // We mark this instruction as zero-cost because the cost of GEPs in
6005 // vectorized code depends on whether the corresponding memory instruction
6006 // is scalarized or not. Therefore, we handle GEPs with the memory
6007 // instruction cost.
6008 return 0;
6009 case Instruction::Br: {
6010 // In the case of scalarized and predicated instructions, there will be VF
6011 // predicated blocks in the vectorized loop. Each branch around these
6012 // blocks also requires an extract of its vector compare i1 element.
6013 bool ScalarPredicatedBB = false;
6014 BranchInst *BI = cast<BranchInst>(I);
6015 if (VF > 1 && BI->isConditional() &&
6016 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6017 PredicatedBBsAfterVectorization.end() ||
6018 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6019 PredicatedBBsAfterVectorization.end()))
6020 ScalarPredicatedBB = true;
6022 if (ScalarPredicatedBB) {
6023 // Return cost for branches around scalarized and predicated blocks.
6024 Type *Vec_i1Ty =
6025 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6026 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6027 (TTI.getCFInstrCost(Instruction::Br) * VF));
6028 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6029 // The back-edge branch will remain, as will all scalar branches.
6030 return TTI.getCFInstrCost(Instruction::Br);
6031 else
6032 // This branch will be eliminated by if-conversion.
6033 return 0;
6034 // Note: We currently assume zero cost for an unconditional branch inside
6035 // a predicated block since it will become a fall-through, although we
6036 // may decide in the future to call TTI for all branches.
6038 case Instruction::PHI: {
6039 auto *Phi = cast<PHINode>(I);
6041 // First-order recurrences are replaced by vector shuffles inside the loop.
6042 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6043 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6044 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6045 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6047 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6048 // converted into select instructions. We require N - 1 selects per phi
6049 // node, where N is the number of incoming values.
6050 if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6051 return (Phi->getNumIncomingValues() - 1) *
6052 TTI.getCmpSelInstrCost(
6053 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6054 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
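// For example, a phi merging values from three predecessors lowers to two
// chained vector selects, hence the (N - 1) factor above.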
6056 return TTI.getCFInstrCost(Instruction::PHI);
6058 case Instruction::UDiv:
6059 case Instruction::SDiv:
6060 case Instruction::URem:
6061 case Instruction::SRem:
6062 // If we have a predicated instruction, it may not be executed for each
6063 // vector lane. Get the scalarization cost and scale this amount by the
6064 // probability of executing the predicated block. If the instruction is not
6065 // predicated, we fall through to the next case.
6066 if (VF > 1 && isScalarWithPredication(I)) {
6067 unsigned Cost = 0;
6069 // These instructions have a non-void type, so account for the phi nodes
6070 // that we will create. This cost is likely to be zero. The phi node
6071 // cost, if any, should be scaled by the block probability because it
6072 // models a copy at the end of each predicated block.
6073 Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6075 // The cost of the non-predicated instruction.
6076 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6078 // The cost of insertelement and extractelement instructions needed for
6079 // scalarization.
6080 Cost += getScalarizationOverhead(I, VF);
6082 // Scale the cost by the probability of executing the predicated blocks.
6083 // This assumes the predicated block for each vector lane is equally
6084 // likely.
6085 return Cost / getReciprocalPredBlockProb();
6087 LLVM_FALLTHROUGH;
6088 case Instruction::Add:
6089 case Instruction::FAdd:
6090 case Instruction::Sub:
6091 case Instruction::FSub:
6092 case Instruction::Mul:
6093 case Instruction::FMul:
6094 case Instruction::FDiv:
6095 case Instruction::FRem:
6096 case Instruction::Shl:
6097 case Instruction::LShr:
6098 case Instruction::AShr:
6099 case Instruction::And:
6100 case Instruction::Or:
6101 case Instruction::Xor: {
6102 // Since we will replace the stride by 1 the multiplication should go away.
6103 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6104 return 0;
6105 // Certain instructions can be cheaper to vectorize if they have a constant
6106 // second vector operand. One example of this is shifts on x86.
6107 Value *Op2 = I->getOperand(1);
6108 TargetTransformInfo::OperandValueProperties Op2VP;
6109 TargetTransformInfo::OperandValueKind Op2VK =
6110 TTI.getOperandInfo(Op2, Op2VP);
6111 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6112 Op2VK = TargetTransformInfo::OK_UniformValue;
6114 SmallVector<const Value *, 4> Operands(I->operand_values());
6115 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6116 return N * TTI.getArithmeticInstrCost(
6117 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6118 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6120 case Instruction::FNeg: {
6121 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6122 return N * TTI.getArithmeticInstrCost(
6123 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6124 TargetTransformInfo::OK_AnyValue,
6125 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6126 I->getOperand(0));
6128 case Instruction::Select: {
6129 SelectInst *SI = cast<SelectInst>(I);
6130 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6131 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6132 Type *CondTy = SI->getCondition()->getType();
6133 if (!ScalarCond)
6134 CondTy = VectorType::get(CondTy, VF);
6136 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6138 case Instruction::ICmp:
6139 case Instruction::FCmp: {
6140 Type *ValTy = I->getOperand(0)->getType();
6141 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6142 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6143 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6144 VectorTy = ToVectorTy(ValTy, VF);
6145 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6147 case Instruction::Store:
6148 case Instruction::Load: {
6149 unsigned Width = VF;
6150 if (Width > 1) {
6151 InstWidening Decision = getWideningDecision(I, Width);
6152 assert(Decision != CM_Unknown &&
6153 "CM decision should be taken at this point");
6154 if (Decision == CM_Scalarize)
6155 Width = 1;
6157 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6158 return getMemoryInstructionCost(I, VF);
6160 case Instruction::ZExt:
6161 case Instruction::SExt:
6162 case Instruction::FPToUI:
6163 case Instruction::FPToSI:
6164 case Instruction::FPExt:
6165 case Instruction::PtrToInt:
6166 case Instruction::IntToPtr:
6167 case Instruction::SIToFP:
6168 case Instruction::UIToFP:
6169 case Instruction::Trunc:
6170 case Instruction::FPTrunc:
6171 case Instruction::BitCast: {
6172 // We optimize the truncation of induction variables having constant
6173 // integer steps. The cost of these truncations is the same as the scalar
6174 // operation.
6175 if (isOptimizableIVTruncate(I, VF)) {
6176 auto *Trunc = cast<TruncInst>(I);
6177 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6178 Trunc->getSrcTy(), Trunc);
6181 Type *SrcScalarTy = I->getOperand(0)->getType();
6182 Type *SrcVecTy =
6183 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6184 if (canTruncateToMinimalBitwidth(I, VF)) {
6185 // This cast is going to be shrunk. This may remove the cast or it might
6186 // turn it into a slightly different cast. For example, if MinBW == 16,
6187 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6189 // Calculate the modified src and dest types.
6190 Type *MinVecTy = VectorTy;
6191 if (I->getOpcode() == Instruction::Trunc) {
6192 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6193 VectorTy =
6194 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6195 } else if (I->getOpcode() == Instruction::ZExt ||
6196 I->getOpcode() == Instruction::SExt) {
6197 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6198 VectorTy =
6199 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6203 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6204 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6206 case Instruction::Call: {
6207 bool NeedToScalarize;
6208 CallInst *CI = cast<CallInst>(I);
6209 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6210 if (getVectorIntrinsicIDForCall(CI, TLI))
6211 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6212 return CallCost;
6214 default:
6215 // The cost of executing VF copies of the scalar instruction. This opcode
6216 // is unknown. Assume that it is the same as 'mul'.
6217 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6218 getScalarizationOverhead(I, VF);
6219 } // end of switch.
6222 char LoopVectorize::ID = 0;
6224 static const char lv_name[] = "Loop Vectorization";
6226 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6227 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6228 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6229 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6230 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6231 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6232 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6233 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6234 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6235 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6236 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6237 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6238 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6239 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6240 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6242 namespace llvm {
6244 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6246 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6247 bool VectorizeOnlyWhenForced) {
6248 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6251 } // end namespace llvm
6253 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6254 // Check if the pointer operand of a load or store instruction is
6255 // consecutive.
6256 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6257 return Legal->isConsecutivePtr(Ptr);
6258 return false;
6261 void LoopVectorizationCostModel::collectValuesToIgnore() {
6262 // Ignore ephemeral values.
6263 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6265 // Ignore type-promoting instructions we identified during reduction
6266 // detection.
6267 for (auto &Reduction : *Legal->getReductionVars()) {
6268 RecurrenceDescriptor &RedDes = Reduction.second;
6269 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6270 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6272 // Ignore type-casting instructions we identified during induction
6273 // detection.
6274 for (auto &Induction : *Legal->getInductionVars()) {
6275 InductionDescriptor &IndDes = Induction.second;
6276 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6277 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6281 // TODO: we could return a pair of values that specify the max VF and
6282 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6283 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6284 // doesn't have a cost model that can choose which plan to execute if
6285 // more than one is generated.
6286 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6287 LoopVectorizationCostModel &CM) {
6288 unsigned WidestType;
6289 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
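// For example, 256-bit vector registers and a widest scalar type of 32 bits
// give a VPlan VF of 256 / 32 = 8 (both widths measured in bits).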
6290 return WidestVectorRegBits / WidestType;
6293 VectorizationFactor
6294 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6295 unsigned VF = UserVF;
6296 // Outer loop handling: outer loops may require CFG and instruction-level
6297 // transformations before we can even evaluate whether vectorization is
6298 // profitable. Since we cannot modify the incoming IR, we need to build
6299 // VPlan upfront in the vectorization pipeline.
6300 if (!OrigLoop->empty()) {
6301 // If the user doesn't provide a vectorization factor, determine a
6302 // reasonable one.
6303 if (!UserVF) {
6304 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6305 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6307 // Make sure we have a VF > 1 for stress testing.
6308 if (VPlanBuildStressTest && VF < 2) {
6309 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6310 << "overriding computed VF.\n");
6311 VF = 4;
6314 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6315 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6316 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6317 << " to build VPlans.\n");
6318 buildVPlans(VF, VF);
6320 // For VPlan build stress testing, we bail out after VPlan construction.
6321 if (VPlanBuildStressTest)
6322 return VectorizationFactor::Disabled();
6324 return {VF, 0};
6327 LLVM_DEBUG(
6328 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6329 "VPlan-native path.\n");
6330 return VectorizationFactor::Disabled();
6333 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6334 assert(OrigLoop->empty() && "Inner loop expected.");
6335 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6336 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6337 return None;
6339 // Invalidate interleave groups if all blocks of loop will be predicated.
6340 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6341 !useMaskedInterleavedAccesses(*TTI)) {
6342 LLVM_DEBUG(
6343 dbgs()
6344 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6345 "which requires masked-interleaved support.\n");
6346 CM.InterleaveInfo.reset();
6349 if (UserVF) {
6350 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6351 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6352 // Collect the instructions (and their associated costs) that will be more
6353 // profitable to scalarize.
6354 CM.selectUserVectorizationFactor(UserVF);
6355 buildVPlansWithVPRecipes(UserVF, UserVF);
6356 LLVM_DEBUG(printPlans(dbgs()));
6357 return {{UserVF, 0}};
6360 unsigned MaxVF = MaybeMaxVF.getValue();
6361 assert(MaxVF != 0 && "MaxVF is zero.");
6363 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6364 // Collect Uniform and Scalar instructions after vectorization with VF.
6365 CM.collectUniformsAndScalars(VF);
6367 // Collect the instructions (and their associated costs) that will be more
6368 // profitable to scalarize.
6369 if (VF > 1)
6370 CM.collectInstsToScalarize(VF);
6373 buildVPlansWithVPRecipes(1, MaxVF);
6374 LLVM_DEBUG(printPlans(dbgs()));
6375 if (MaxVF == 1)
6376 return VectorizationFactor::Disabled();
6378 // Select the optimal vectorization factor.
6379 return CM.selectVectorizationFactor(MaxVF);
6382 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6383 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6384 << '\n');
6385 BestVF = VF;
6386 BestUF = UF;
6388 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6389 return !Plan->hasVF(VF);
6391 assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6394 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6395 DominatorTree *DT) {
6396 // Perform the actual loop transformation.
6398 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6399 VPCallbackILV CallbackILV(ILV);
6401 VPTransformState State{BestVF, BestUF, LI,
6402 DT, ILV.Builder, ILV.VectorLoopValueMap,
6403 &ILV, CallbackILV};
6404 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6405 State.TripCount = ILV.getOrCreateTripCount(nullptr);
6407 //===------------------------------------------------===//
6409 // Notice: any optimization or new instruction that goes
6410 // into the code below should also be implemented in
6411 // the cost-model.
6413 //===------------------------------------------------===//
6415 // 2. Copy and widen instructions from the old loop into the new loop.
6416 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6417 VPlans.front()->execute(&State);
6419 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6420 // predication, updating analyses.
6421 ILV.fixVectorizedLoop();
6424 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6425 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6426 BasicBlock *Latch = OrigLoop->getLoopLatch();
6428 // We create new control-flow for the vectorized loop, so the original
6429 // condition will be dead after vectorization if it's only used by the
6430 // branch.
6431 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6432 if (Cmp && Cmp->hasOneUse())
6433 DeadInstructions.insert(Cmp);
6435 // We create new "steps" for induction variable updates to which the original
6436 // induction variables map. An original update instruction will be dead if
6437 // all its users except the induction variable are dead.
6438 for (auto &Induction : *Legal->getInductionVars()) {
6439 PHINode *Ind = Induction.first;
6440 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6441 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6442 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6443 DeadInstructions.end();
6445 DeadInstructions.insert(IndUpdate);
6447 // We record as "Dead" also the type-casting instructions we had identified
6448 // during induction analysis. We don't need any handling for them in the
6449 // vectorized loop because we have proven that, under a proper runtime
6450 // test guarding the vectorized loop, the value of the phi, and the casted
6451 // value of the phi, are the same. The last instruction in this casting chain
6452 // will get its scalar/vector/widened def from the scalar/vector/widened def
6453 // of the respective phi node. Any other casts in the induction def-use chain
6454 // have no other uses outside the phi update chain, and will be ignored.
6455 InductionDescriptor &IndDes = Induction.second;
6456 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6457 DeadInstructions.insert(Casts.begin(), Casts.end());
6461 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6463 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6465 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6466 Instruction::BinaryOps BinOp) {
6467 // When unrolling and the VF is 1, we only need to add a simple scalar.
6468 Type *Ty = Val->getType();
6469 assert(!Ty->isVectorTy() && "Val must be a scalar");
6471 if (Ty->isFloatingPointTy()) {
6472 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6474 // Floating point operations had to be 'fast' to enable the unrolling.
6475 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6476 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6478 Constant *C = ConstantInt::get(Ty, StartIdx);
6479 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
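// Attaches "llvm.loop.unroll.runtime.disable" to the loop ID metadata, unless
// llvm.loop.unroll.disable metadata is already present, so the runtime
// unroller does not later unroll the vectorized loop again. The result is
// roughly:
//   !llvm.loop = distinct !{<self>, ..., !{!"llvm.loop.unroll.runtime.disable"}}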
6482 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6483 SmallVector<Metadata *, 4> MDs;
6484 // Reserve first location for self reference to the LoopID metadata node.
6485 MDs.push_back(nullptr);
6486 bool IsUnrollMetadata = false;
6487 MDNode *LoopID = L->getLoopID();
6488 if (LoopID) {
6489 // First find existing loop unrolling disable metadata.
6490 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6491 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6492 if (MD) {
6493 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6494 IsUnrollMetadata =
6495 S && S->getString().startswith("llvm.loop.unroll.disable");
6497 MDs.push_back(LoopID->getOperand(i));
6501 if (!IsUnrollMetadata) {
6502 // Add runtime unroll disable metadata.
6503 LLVMContext &Context = L->getHeader()->getContext();
6504 SmallVector<Metadata *, 1> DisableOperands;
6505 DisableOperands.push_back(
6506 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6507 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6508 MDs.push_back(DisableNode);
6509 MDNode *NewLoopID = MDNode::get(Context, MDs);
6510 // Set operand 0 to refer to the loop id itself.
6511 NewLoopID->replaceOperandWith(0, NewLoopID);
6512 L->setLoopID(NewLoopID);
6516 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6517 const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6518 assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6519 bool PredicateAtRangeStart = Predicate(Range.Start);
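// Clamp the range at the first VF whose decision differs from the one taken
// at Range.Start. For example, for Range = [2, 16) with the predicate
// flipping at VF = 8, the range is trimmed to [2, 8) and the decision made
// for VF = 2 is returned, holding for every VF left in the range.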
6521 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6522 if (Predicate(TmpVF) != PredicateAtRangeStart) {
6523 Range.End = TmpVF;
6524 break;
6527 return PredicateAtRangeStart;
6530 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6531 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6532 /// of VF's starting at a given VF and extending it as much as possible. Each
6533 /// vectorization decision can potentially shorten this sub-range during
6534 /// buildVPlan().
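/// For example, with MinVF = 1 and MaxVF = 8 this might build one VPlan
/// covering {1, 2} and another covering {4, 8}, depending on where the
/// per-VF decisions change across the range.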
6535 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6536 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6537 VFRange SubRange = {VF, MaxVF + 1};
6538 VPlans.push_back(buildVPlan(SubRange));
6539 VF = SubRange.End;
6543 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6544 VPlanPtr &Plan) {
6545 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6547 // Look for cached value.
6548 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6549 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6550 if (ECEntryIt != EdgeMaskCache.end())
6551 return ECEntryIt->second;
6553 VPValue *SrcMask = createBlockInMask(Src, Plan);
6555 // The terminator has to be a branch inst!
6556 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6557 assert(BI && "Unexpected terminator found");
6559 if (!BI->isConditional())
6560 return EdgeMaskCache[Edge] = SrcMask;
6562 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6563 assert(EdgeMask && "No Edge Mask found for condition");
6565 if (BI->getSuccessor(0) != Dst)
6566 EdgeMask = Builder.createNot(EdgeMask);
6568 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6569 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6571 return EdgeMaskCache[Edge] = EdgeMask;
6574 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6575 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6577 // Look for cached value.
6578 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6579 if (BCEntryIt != BlockMaskCache.end())
6580 return BCEntryIt->second;
6582 // All-one mask is modelled as no-mask following the convention for masked
6583 // load/store/gather/scatter. Initialize BlockMask to no-mask.
6584 VPValue *BlockMask = nullptr;
6586 if (OrigLoop->getHeader() == BB) {
6587 if (!CM.blockNeedsPredication(BB))
6588 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6590 // Introduce the early-exit compare IV <= BTC to form header block mask.
6591 // This is used instead of IV < TC because TC may wrap, unlike BTC.
6592 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6593 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6594 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6595 return BlockMaskCache[BB] = BlockMask;
6598 // This is the block mask. We OR all incoming edges.
6599 for (auto *Predecessor : predecessors(BB)) {
6600 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6601 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6602 return BlockMaskCache[BB] = EdgeMask;
6604 if (!BlockMask) { // BlockMask still has its initial nullptr value.
6605 BlockMask = EdgeMask;
6606 continue;
6609 BlockMask = Builder.createOr(BlockMask, EdgeMask);
6612 return BlockMaskCache[BB] = BlockMask;
6615 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6616 VFRange &Range,
6617 VPlanPtr &Plan) {
6618 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6619 if (!IG)
6620 return nullptr;
6622 // Now check if IG is relevant for VF's in the given range.
6623 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6624 return [=](unsigned VF) -> bool {
6625 return (VF >= 2 && // Query is illegal for VF == 1
6626 CM.getWideningDecision(I, VF) ==
6627 LoopVectorizationCostModel::CM_Interleave);
6630 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6631 return nullptr;
6633 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6634 // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
6635 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6636 assert(I == IG->getInsertPos() &&
6637 "Generating a recipe for an adjunct member of an interleave group");
6639 VPValue *Mask = nullptr;
6640 if (Legal->isMaskRequired(I))
6641 Mask = createBlockInMask(I->getParent(), Plan);
6643 return new VPInterleaveRecipe(IG, Mask);
6646 VPWidenMemoryInstructionRecipe *
6647 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6648 VPlanPtr &Plan) {
6649 if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6650 return nullptr;
6652 auto willWiden = [&](unsigned VF) -> bool {
6653 if (VF == 1)
6654 return false;
6655 if (CM.isScalarAfterVectorization(I, VF) ||
6656 CM.isProfitableToScalarize(I, VF))
6657 return false;
6658 LoopVectorizationCostModel::InstWidening Decision =
6659 CM.getWideningDecision(I, VF);
6660 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6661 "CM decision should be taken at this point.");
6662 assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6663 "Interleave memory opportunity should be caught earlier.");
6664 return Decision != LoopVectorizationCostModel::CM_Scalarize;
6667 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6668 return nullptr;
6670 VPValue *Mask = nullptr;
6671 if (Legal->isMaskRequired(I))
6672 Mask = createBlockInMask(I->getParent(), Plan);
6674 return new VPWidenMemoryInstructionRecipe(*I, Mask);
6677 VPWidenIntOrFpInductionRecipe *
6678 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6679 if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6680 // Check if this is an integer or fp induction. If so, build the recipe that
6681 // produces its scalar and vector values.
6682 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6683 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6684 II.getKind() == InductionDescriptor::IK_FpInduction)
6685 return new VPWidenIntOrFpInductionRecipe(Phi);
6687 return nullptr;
6690 // Optimize the special case where the source is a constant integer
6691 // induction variable. Notice that we can only optimize the 'trunc' case
6692 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6693 // (c) other casts depend on pointer size.
6695 // Determine whether \p K is a truncation based on an induction variable that
6696 // can be optimized.
6697 auto isOptimizableIVTruncate =
6698 [&](Instruction *K) -> std::function<bool(unsigned)> {
6699 return
6700 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6703 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6704 isOptimizableIVTruncate(I), Range))
6705 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6706 cast<TruncInst>(I));
6707 return nullptr;
6710 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6711 PHINode *Phi = dyn_cast<PHINode>(I);
6712 if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6713 return nullptr;
6715 // We know that all PHIs in non-header blocks are converted into selects, so
6716 // we don't have to worry about the insertion order and we can just use the
6717 // builder. At this point we generate the predication tree. There may be
6718 // duplications since this is a simple recursive scan, but future
6719 // optimizations will clean it up.
6721 SmallVector<VPValue *, 2> Masks;
6722 unsigned NumIncoming = Phi->getNumIncomingValues();
6723 for (unsigned In = 0; In < NumIncoming; In++) {
6724 VPValue *EdgeMask =
6725 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6726 assert((EdgeMask || NumIncoming == 1) &&
6727 "Multiple predecessors with one having a full mask");
6728 if (EdgeMask)
6729 Masks.push_back(EdgeMask);
6731 return new VPBlendRecipe(Phi, Masks);
6734 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6735 VFRange &Range) {
6737 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6738 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6740 if (IsPredicated)
6741 return false;
6743 auto IsVectorizableOpcode = [](unsigned Opcode) {
6744 switch (Opcode) {
6745 case Instruction::Add:
6746 case Instruction::And:
6747 case Instruction::AShr:
6748 case Instruction::BitCast:
6749 case Instruction::Br:
6750 case Instruction::Call:
6751 case Instruction::FAdd:
6752 case Instruction::FCmp:
6753 case Instruction::FDiv:
6754 case Instruction::FMul:
6755 case Instruction::FNeg:
6756 case Instruction::FPExt:
6757 case Instruction::FPToSI:
6758 case Instruction::FPToUI:
6759 case Instruction::FPTrunc:
6760 case Instruction::FRem:
6761 case Instruction::FSub:
6762 case Instruction::GetElementPtr:
6763 case Instruction::ICmp:
6764 case Instruction::IntToPtr:
6765 case Instruction::Load:
6766 case Instruction::LShr:
6767 case Instruction::Mul:
6768 case Instruction::Or:
6769 case Instruction::PHI:
6770 case Instruction::PtrToInt:
6771 case Instruction::SDiv:
6772 case Instruction::Select:
6773 case Instruction::SExt:
6774 case Instruction::Shl:
6775 case Instruction::SIToFP:
6776 case Instruction::SRem:
6777 case Instruction::Store:
6778 case Instruction::Sub:
6779 case Instruction::Trunc:
6780 case Instruction::UDiv:
6781 case Instruction::UIToFP:
6782 case Instruction::URem:
6783 case Instruction::Xor:
6784 case Instruction::ZExt:
6785 return true;
6787 return false;
6790 if (!IsVectorizableOpcode(I->getOpcode()))
6791 return false;
6793 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6794 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6795 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6796 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6797 return false;
6800 auto willWiden = [&](unsigned VF) -> bool {
6801 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6802 CM.isProfitableToScalarize(I, VF)))
6803 return false;
6804 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6805 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6806 // The following case may be scalarized depending on the VF.
6807 // The flag shows whether we use an intrinsic or a usual call for the
6808 // vectorized version of the instruction.
6809 // Is it beneficial to perform the intrinsic call rather than the lib call?
6810 bool NeedToScalarize;
6811 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6812 bool UseVectorIntrinsic =
6813 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6814 return UseVectorIntrinsic || !NeedToScalarize;
6816 if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6817 assert(CM.getWideningDecision(I, VF) ==
6818 LoopVectorizationCostModel::CM_Scalarize &&
6819 "Memory widening decisions should have been taken care by now");
6820 return false;
6822 return true;
6825 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6826 return false;
6828 // Success: widen this instruction. We optimize the common case where
6829 // consecutive instructions can be represented by a single recipe.
6830 if (!VPBB->empty()) {
6831 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6832 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6833 return true;
6836 VPBB->appendRecipe(new VPWidenRecipe(I));
6837 return true;
6840 VPBasicBlock *VPRecipeBuilder::handleReplication(
6841 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6842 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6843 VPlanPtr &Plan) {
6844 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6845 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6846 Range);
6848 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6849 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6851 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6853 // Find if I uses a predicated instruction. If so, it will use its scalar
6854 // value. Avoid hoisting the insert-element which packs the scalar value into
6855 // a vector value, as that happens iff all users use the vector value.
6856 for (auto &Op : I->operands())
6857 if (auto *PredInst = dyn_cast<Instruction>(Op))
6858 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6859 PredInst2Recipe[PredInst]->setAlsoPack(false);
6861 // Finalize the recipe for Instr; handle the non-predicated case first.
6862 if (!IsPredicated) {
6863 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6864 VPBB->appendRecipe(Recipe);
6865 return VPBB;
6867 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6868 assert(VPBB->getSuccessors().empty() &&
6869 "VPBB has successors when handling predicated replication.");
6870 // Record predicated instructions for above packing optimizations.
6871 PredInst2Recipe[I] = Recipe;
6872 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6873 VPBlockUtils::insertBlockAfter(Region, VPBB);
6874 auto *RegSucc = new VPBasicBlock();
6875 VPBlockUtils::insertBlockAfter(RegSucc, Region);
6876 return RegSucc;
6879 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6880 VPRecipeBase *PredRecipe,
6881 VPlanPtr &Plan) {
6882 // Instructions marked for predication are replicated and placed under an
6883 // if-then construct to prevent side-effects.
6885 // Generate recipes to compute the block mask for this region.
6886 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6888 // Build the triangular if-then region.
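// The resulting single-entry single-exit region is a triangle:
//
//   pred.<opcode>.entry     (branch-on-mask)
//     |          \
//     |     pred.<opcode>.if      (replicated Instr)
//     |          /
//   pred.<opcode>.continue  (phi merging the predicated value, if any)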
6889 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6890 assert(Instr->getParent() && "Predicated instruction not in any basic block");
6891 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6892 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6893 auto *PHIRecipe =
6894 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6895 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6896 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6897 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6899 // Note: first set Entry as region entry and then connect successors starting
6900 // from it in order, to propagate the "parent" of each VPBasicBlock.
6901 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6902 VPBlockUtils::connectBlocks(Pred, Exit);
6904 return Region;
6905 }
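// Editorial illustration of the replicate region built above, assuming the
// predicated instruction is a load:
//
//   pred.load.entry:    BRANCH-ON-MASK (BlockInMask)
//        |   \
//        |   pred.load.if:       replicated, predicated load
//        |   /
//   pred.load.continue: PHI-PREDICATED-INSTRUCTION
//
// The entry block conditionally branches around the replicated instruction;
// no PHI recipe is created for void-typed instructions such as stores.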
6907 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6908 VPlanPtr &Plan, VPBasicBlock *VPBB) {
6909 VPRecipeBase *Recipe = nullptr;
6910 // Check if Instr should belong to an interleave memory recipe, or already
6911 // does. In the latter case Instr is irrelevant.
6912 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6913 VPBB->appendRecipe(Recipe);
6914 return true;
6915 }
6917 // Check if Instr is a memory operation that should be widened.
6918 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6919 VPBB->appendRecipe(Recipe);
6920 return true;
6921 }
6923 // Check if Instr should form some PHI recipe.
6924 if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6925 VPBB->appendRecipe(Recipe);
6926 return true;
6927 }
6928 if ((Recipe = tryToBlend(Instr, Plan))) {
6929 VPBB->appendRecipe(Recipe);
6930 return true;
6931 }
6932 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6933 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6934 return true;
6935 }
6937 // Check if Instr is to be widened by a general VPWidenRecipe, after
6938 // having first checked for specific widening recipes that deal with
6939 // Interleave Groups, Inductions and Phi nodes.
6940 if (tryToWiden(Instr, VPBB, Range))
6941 return true;
6943 return false;
6944 }
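// Editorial note on the ordering above: the earlier checks take precedence.
// For example, a load that belongs to an interleave group is claimed by
// tryToInterleaveMemory() before tryToWidenMemory() ever sees it, and an
// induction PHI is normally handled by tryToOptimizeInduction() before the
// generic VPWidenPHIRecipe fallback applies.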
6946 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6947 unsigned MaxVF) {
6948 assert(OrigLoop->empty() && "Inner loop expected.");
6950 // Collect conditions feeding internal conditional branches; they need to be
6951 // represented in VPlan for it to model masking.
6952 SmallPtrSet<Value *, 1> NeedDef;
6954 auto *Latch = OrigLoop->getLoopLatch();
6955 for (BasicBlock *BB : OrigLoop->blocks()) {
6956 if (BB == Latch)
6957 continue;
6958 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6959 if (Branch && Branch->isConditional())
6960 NeedDef.insert(Branch->getCondition());
6961 }
6963 // If the tail is to be folded by masking, the primary induction variable
6964 // needs to be represented in VPlan for it to model early-exit masking.
6965 // Also, both the Phi and the live-out instruction of each reduction are
6966 // required in order to introduce a select between them in VPlan.
6967 if (CM.foldTailByMasking()) {
6968 NeedDef.insert(Legal->getPrimaryInduction());
6969 for (auto &Reduction : *Legal->getReductionVars()) {
6970 NeedDef.insert(Reduction.first);
6971 NeedDef.insert(Reduction.second.getLoopExitInstr());
6972 }
6973 }
6975 // Collect instructions from the original loop that will become trivially dead
6976 // in the vectorized loop. We don't need to vectorize these instructions. For
6977 // example, original induction update instructions can become dead because we
6978 // separately emit induction "steps" when generating code for the new loop.
6979 // Similarly, we create a new latch condition when setting up the structure
6980 // of the new loop, so the old one can become dead.
6981 SmallPtrSet<Instruction *, 4> DeadInstructions;
6982 collectTriviallyDeadInstructions(DeadInstructions);
6984 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6985 VFRange SubRange = {VF, MaxVF + 1};
6986 VPlans.push_back(
6987 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6988 VF = SubRange.End;
6989 }
6990 }
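// Editorial illustration (hypothetical values): with MinVF = 2 and MaxVF = 16
// the loop above first builds a VPlan for the sub-range starting at VF = 2.
// If some widening decision changes at, say, VF = 8, the
// getDecisionAndClampRange() queries clamp that sub-range so the first VPlan
// covers {2,4} and a second VPlan is built for {8,16}; otherwise a single
// VPlan covers all of {2,4,8,16}.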
6992 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6993 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6994 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6995 // Hold a mapping from predicated instructions to their recipes, in order to
6996 // fix their AlsoPack behavior if a user is determined to replicate and use a
6997 // scalar instead of vector value.
6998 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7000 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7001 DenseMap<Instruction *, Instruction *> SinkAfterInverse;
7003 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7004 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7005 auto Plan = std::make_unique<VPlan>(VPBB);
7007 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7008 // Represent values that will have defs inside VPlan.
7009 for (Value *V : NeedDef)
7010 Plan->addVPValue(V);
7012 // Scan the body of the loop in a topological order to visit each basic block
7013 // after having visited its predecessor basic blocks.
7014 LoopBlocksDFS DFS(OrigLoop);
7015 DFS.perform(LI);
7017 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7018 // Relevant instructions from basic block BB will be grouped into VPRecipe
7019 // ingredients and will fill a new VPBasicBlock.
7020 unsigned VPBBsForBB = 0;
7021 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7022 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7023 VPBB = FirstVPBBForBB;
7024 Builder.setInsertPoint(VPBB);
7026 std::vector<Instruction *> Ingredients;
7028 // Organize the ingredients to vectorize from current basic block in the
7029 // right order.
7030 for (Instruction &I : BB->instructionsWithoutDebug()) {
7031 Instruction *Instr = &I;
7033 // First filter out irrelevant instructions, to ensure no recipes are
7034 // built for them.
7035 if (isa<BranchInst>(Instr) ||
7036 DeadInstructions.find(Instr) != DeadInstructions.end())
7037 continue;
7039 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
7040 // member of the IG, do not construct any Recipe for it.
7041 const InterleaveGroup<Instruction> *IG =
7042 CM.getInterleavedAccessGroup(Instr);
7043 if (IG && Instr != IG->getInsertPos() &&
7044 Range.Start >= 2 && // Query is illegal for VF == 1
7045 CM.getWideningDecision(Instr, Range.Start) ==
7046 LoopVectorizationCostModel::CM_Interleave) {
7047 auto SinkCandidate = SinkAfterInverse.find(Instr);
7048 if (SinkCandidate != SinkAfterInverse.end())
7049 Ingredients.push_back(SinkCandidate->second);
7050 continue;
7051 }
7053 // Move instructions to handle first-order recurrences, step 1: avoid
7054 // handling this instruction until after we've handled the instruction it
7055 // should follow.
7056 auto SAIt = SinkAfter.find(Instr);
7057 if (SAIt != SinkAfter.end()) {
7058 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7059 << *SAIt->second
7060 << " to vectorize a 1st order recurrence.\n");
7061 SinkAfterInverse[SAIt->second] = Instr;
7062 continue;
7063 }
7065 Ingredients.push_back(Instr);
7067 // Move instructions to handle first-order recurrences, step 2: push the
7068 // instruction to be sunk at its insertion point.
7069 auto SAInvIt = SinkAfterInverse.find(Instr);
7070 if (SAInvIt != SinkAfterInverse.end())
7071 Ingredients.push_back(SAInvIt->second);
7072 }
7074 // Introduce each ingredient into VPlan.
7075 for (Instruction *Instr : Ingredients) {
7076 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7077 continue;
7079 // Otherwise, if all widening options failed, Instruction is to be
7080 // replicated. This may create a successor for VPBB.
7081 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7082 Instr, Range, VPBB, PredInst2Recipe, Plan);
7083 if (NextVPBB != VPBB) {
7084 VPBB = NextVPBB;
7085 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7086 : "");
7087 }
7088 }
7089 }
7091 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7093 // may also be empty, such as the last one, VPBB, reflecting original
7093 // basic-blocks with no recipes.
7094 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7095 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7096 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7097 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7098 delete PreEntry;
7100 // Finally, if tail is folded by masking, introduce selects between the phi
7101 // and the live-out instruction of each reduction, at the end of the latch.
7102 if (CM.foldTailByMasking()) {
7103 Builder.setInsertPoint(VPBB);
7104 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7105 for (auto &Reduction : *Legal->getReductionVars()) {
7106 VPValue *Phi = Plan->getVPValue(Reduction.first);
7107 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7108 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7109 }
7110 }
7112 std::string PlanName;
7113 raw_string_ostream RSO(PlanName);
7114 unsigned VF = Range.Start;
7115 Plan->addVF(VF);
7116 RSO << "Initial VPlan for VF={" << VF;
7117 for (VF *= 2; VF < Range.End; VF *= 2) {
7118 Plan->addVF(VF);
7119 RSO << "," << VF;
7120 }
7121 RSO << "},UF>=1";
7122 RSO.flush();
7123 Plan->setName(PlanName);
7125 return Plan;
7126 }
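// Editorial illustration: for a range with Range.Start = 4 and Range.End = 16
// the naming code above produces
//   "Initial VPlan for VF={4,8},UF>=1"
// which is the string used when this plan is printed or dumped.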
7128 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7129 // Outer loop handling: outer loops may require CFG and instruction level
7130 // transformations before even evaluating whether vectorization is profitable.
7131 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7132 // the vectorization pipeline.
7133 assert(!OrigLoop->empty());
7134 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7136 // Create new empty VPlan
7137 auto Plan = std::make_unique<VPlan>();
7139 // Build hierarchical CFG
7140 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7141 HCFGBuilder.buildHierarchicalCFG();
7143 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7144 Plan->addVF(VF);
7146 if (EnableVPlanPredication) {
7147 VPlanPredicator VPP(*Plan);
7148 VPP.predicate();
7150 // Avoid running transformation to recipes until masked code generation in
7151 // VPlan-native path is in place.
7152 return Plan;
7153 }
7155 SmallPtrSet<Instruction *, 1> DeadInstructions;
7156 VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7157 Plan, Legal->getInductionVars(), DeadInstructions);
7159 return Plan;
7160 }
7162 Value* LoopVectorizationPlanner::VPCallbackILV::
7163 getOrCreateVectorValues(Value *V, unsigned Part) {
7164 return ILV.getOrCreateVectorValue(V, Part);
7165 }
7167 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7168 O << " +\n"
7169 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7170 IG->getInsertPos()->printAsOperand(O, false);
7171 if (User) {
7172 O << ", ";
7173 User->getOperand(0)->printAsOperand(O);
7174 }
7175 O << "\\l\"";
7176 for (unsigned i = 0; i < IG->getFactor(); ++i)
7177 if (Instruction *I = IG->getMember(i))
7178 O << " +\n"
7179 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
7180 }
7182 void VPWidenRecipe::execute(VPTransformState &State) {
7183 for (auto &Instr : make_range(Begin, End))
7184 State.ILV->widenInstruction(Instr);
7185 }
7187 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7188 assert(!State.Instance && "Int or FP induction being replicated.");
7189 State.ILV->widenIntOrFpInduction(IV, Trunc);
7190 }
7192 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7193 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7194 }
7196 void VPBlendRecipe::execute(VPTransformState &State) {
7197 State.ILV->setDebugLocFromInst(State.Builder, Phi);
7198 // We know that all PHIs in non-header blocks are converted into
7199 // selects, so we don't have to worry about the insertion order and we
7200 // can just use the builder.
7201 // At this point we generate the predication tree. There may be
7202 // duplications since this is a simple recursive scan, but future
7203 // optimizations will clean it up.
7205 unsigned NumIncoming = Phi->getNumIncomingValues();
7207 assert((User || NumIncoming == 1) &&
7208 "Multiple predecessors with one having a full mask");
7209 // Generate a sequence of selects of the form:
7210 // SELECT(Mask3, In3,
7211 // SELECT(Mask2, In2,
7212 // ( ...)))
7213 InnerLoopVectorizer::VectorParts Entry(State.UF);
7214 for (unsigned In = 0; In < NumIncoming; ++In) {
7215 for (unsigned Part = 0; Part < State.UF; ++Part) {
7216 // We might have single edge PHIs (blocks) - use an identity
7217 // 'select' for the first PHI operand.
7218 Value *In0 =
7219 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7220 if (In == 0)
7221 Entry[Part] = In0; // Initialize with the first incoming value.
7222 else {
7223 // Select between the current value and the previous incoming edge
7224 // based on the incoming mask.
7225 Value *Cond = State.get(User->getOperand(In), Part);
7226 Entry[Part] =
7227 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7228 }
7229 }
7230 }
7231 for (unsigned Part = 0; Part < State.UF; ++Part)
7232 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7233 }
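// Editorial IR sketch (names are illustrative): for a blend of three incoming
// values the loop above emits, per unrolled part, roughly
//   %predphi  = select <VF x i1> %mask1, <VF x Ty> %in1, <VF x Ty> %in0
//   %predphi1 = select <VF x i1> %mask2, <VF x Ty> %in2, <VF x Ty> %predphi
// i.e. the first incoming value seeds the chain and every further incoming
// edge wraps it in one more select keyed on that edge's mask.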
7235 void VPInterleaveRecipe::execute(VPTransformState &State) {
7236 assert(!State.Instance && "Interleave group being replicated.");
7237 if (!User)
7238 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7240 // Last (and currently only) operand is a mask.
7241 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7242 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7243 for (unsigned Part = 0; Part < State.UF; ++Part)
7244 MaskValues[Part] = State.get(Mask, Part);
7245 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7246 }
7248 void VPReplicateRecipe::execute(VPTransformState &State) {
7249 if (State.Instance) { // Generate a single instance.
7250 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7251 // Insert scalar instance packing it into a vector.
7252 if (AlsoPack && State.VF > 1) {
7253 // If we're constructing lane 0, initialize to start from undef.
7254 if (State.Instance->Lane == 0) {
7255 Value *Undef =
7256 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7257 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7258 }
7259 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7260 }
7261 return;
7262 }
7264 // Generate scalar instances for all VF lanes of all UF parts, unless the
7265 // instruction is uniform, in which case generate only the first lane for each
7266 // of the UF parts.
7267 unsigned EndLane = IsUniform ? 1 : State.VF;
7268 for (unsigned Part = 0; Part < State.UF; ++Part)
7269 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7270 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7271 }
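// Editorial worked example: with UF = 2 and VF = 4, a non-uniform replicate
// recipe emits 8 scalar clones of the ingredient (parts 0-1 x lanes 0-3),
// while a uniform one emits only 2 clones, one for lane 0 of each part.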
7273 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7274 assert(State.Instance && "Branch on Mask works only on single instance.");
7276 unsigned Part = State.Instance->Part;
7277 unsigned Lane = State.Instance->Lane;
7279 Value *ConditionBit = nullptr;
7280 if (!User) // Block in mask is all-one.
7281 ConditionBit = State.Builder.getTrue();
7282 else {
7283 VPValue *BlockInMask = User->getOperand(0);
7284 ConditionBit = State.get(BlockInMask, Part);
7285 if (ConditionBit->getType()->isVectorTy())
7286 ConditionBit = State.Builder.CreateExtractElement(
7287 ConditionBit, State.Builder.getInt32(Lane));
7288 }
7290 // Replace the temporary unreachable terminator with a new conditional branch,
7291 // whose two destinations will be set later when they are created.
7292 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7293 assert(isa<UnreachableInst>(CurrentTerminator) &&
7294 "Expected to replace unreachable terminator with conditional branch.");
7295 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7296 CondBr->setSuccessor(0, nullptr);
7297 ReplaceInstWithInst(CurrentTerminator, CondBr);
7298 }
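// Editorial IR sketch (illustrative only): for a vector mask this replaces
// the placeholder unreachable terminator with roughly
//   %cond = extractelement <VF x i1> %mask, i32 <Lane>
//   br i1 %cond, ...
// where both branch destinations are still null at this point; they are
// filled in later, once the blocks of the replicate region are created.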
7300 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7301 assert(State.Instance && "Predicated instruction PHI works per instance.");
7302 Instruction *ScalarPredInst = cast<Instruction>(
7303 State.ValueMap.getScalarValue(PredInst, *State.Instance));
7304 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7305 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7306 assert(PredicatingBB && "Predicated block has no single predecessor.");
7308 // By current pack/unpack logic we need to generate only a single phi node: if
7309 // a vector value for the predicated instruction exists at this point it means
7310 // the instruction has vector users only, and a phi for the vector value is
7311 // needed. In this case the recipe of the predicated instruction is marked to
7312 // also do that packing, thereby "hoisting" the insert-element sequence.
7313 // Otherwise, a phi node for the scalar value is needed.
7314 unsigned Part = State.Instance->Part;
7315 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7316 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7317 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7318 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7319 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7320 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7321 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7322 } else {
7323 Type *PredInstType = PredInst->getType();
7324 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7325 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7326 Phi->addIncoming(ScalarPredInst, PredicatedBB);
7327 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7328 }
7329 }
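// Editorial IR sketch (labels are illustrative): in the scalar case the PHI
// created above looks roughly like
//   %phi = phi i32 [ undef, %pred.xxx.entry ], [ %scalar.val, %pred.xxx.if ]
// while in the vector case it instead merges the vector from before the
// insertelement with the vector that contains the newly inserted lane.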
7331 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7332 if (!User)
7333 return State.ILV->vectorizeMemoryInstruction(&Instr);
7335 // Last (and currently only) operand is a mask.
7336 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7337 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7338 for (unsigned Part = 0; Part < State.UF; ++Part)
7339 MaskValues[Part] = State.get(Mask, Part);
7340 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7341 }
7343 static ScalarEpilogueLowering
7344 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7345 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7346 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7347 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7348 (F->hasOptSize() ||
7349 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7350 SEL = CM_ScalarEpilogueNotAllowedOptSize;
7351 else if (PreferPredicateOverEpilog || Hints.getPredicate())
7352 SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7354 return SEL;
7355 }
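// Editorial examples of the policy above: a function built for size (or whose
// loop header llvm::shouldOptimizeForSize() flags) without a force-enable
// hint gets CM_ScalarEpilogueNotAllowedOptSize; if tail folding is requested
// via the predicate hint or the PreferPredicateOverEpilog option, the result
// is CM_ScalarEpilogueNotNeededUsePredicate; otherwise a scalar epilogue
// remains allowed.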
7357 // Process the loop in the VPlan-native vectorization path. This path builds
7358 // VPlan upfront in the vectorization pipeline, which allows applying
7359 // VPlan-to-VPlan transformations from the very beginning without modifying the
7360 // input LLVM IR.
7361 static bool processLoopInVPlanNativePath(
7362 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7363 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7364 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7365 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7366 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7368 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7369 Function *F = L->getHeader()->getParent();
7370 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7371 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7373 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7374 &Hints, IAI);
7375 // Use the planner for outer loop vectorization.
7376 // TODO: CM is not used at this point inside the planner. Turn CM into an
7377 // optional argument if we don't need it in the future.
7378 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7380 // Get user vectorization factor.
7381 const unsigned UserVF = Hints.getWidth();
7383 // Plan how to best vectorize, return the best VF and its cost.
7384 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7386 // If we are stress testing VPlan builds, do not attempt to generate vector
7387 // code. Masked vector code generation support will follow soon.
7388 // Also, do not attempt to vectorize if no vector code will be produced.
7389 if (VPlanBuildStressTest || EnableVPlanPredication ||
7390 VectorizationFactor::Disabled() == VF)
7391 return false;
7393 LVP.setBestPlan(VF.Width, 1);
7395 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7396 &CM);
7397 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7398 << L->getHeader()->getParent()->getName() << "\"\n");
7399 LVP.executePlan(LB, DT);
7401 // Mark the loop as already vectorized to avoid vectorizing again.
7402 Hints.setAlreadyVectorized();
7404 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7405 return true;
7406 }
7408 bool LoopVectorizePass::processLoop(Loop *L) {
7409 assert((EnableVPlanNativePath || L->empty()) &&
7410 "VPlan-native path is not enabled. Only process inner loops.");
7412 #ifndef NDEBUG
7413 const std::string DebugLocStr = getDebugLocString(L);
7414 #endif /* NDEBUG */
7416 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7417 << L->getHeader()->getParent()->getName() << "\" from "
7418 << DebugLocStr << "\n");
7420 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7422 LLVM_DEBUG(
7423 dbgs() << "LV: Loop hints:"
7424 << " force="
7425 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7426 ? "disabled"
7427 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7428 ? "enabled"
7429 : "?"))
7430 << " width=" << Hints.getWidth()
7431 << " unroll=" << Hints.getInterleave() << "\n");
7433 // Function containing loop
7434 Function *F = L->getHeader()->getParent();
7436 // Looking at the diagnostic output is the only way to determine if a loop
7437 // was vectorized (other than looking at the IR or machine code), so it
7438 // is important to generate an optimization remark for each loop. Most of
7439 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7440 // generated as OptimizationRemark and OptimizationRemarkMissed are
7441 // less verbose; they report vectorized loops and unvectorized loops that may
7442 // benefit from vectorization, respectively.
7444 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7445 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7446 return false;
7447 }
7449 PredicatedScalarEvolution PSE(*SE, *L);
7451 // Check if it is legal to vectorize the loop.
7452 LoopVectorizationRequirements Requirements(*ORE);
7453 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7454 &Requirements, &Hints, DB, AC);
7455 if (!LVL.canVectorize(EnableVPlanNativePath)) {
7456 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7457 Hints.emitRemarkWithHints();
7458 return false;
7459 }
7461 // Check the function attributes and profiles to find out if this function
7462 // should be optimized for size.
7463 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7465 // Entrance to the VPlan-native vectorization path. Outer loops are processed
7466 // here. They may require CFG and instruction level transformations before
7467 // even evaluating whether vectorization is profitable. Since we cannot modify
7468 // the incoming IR, we need to build VPlan upfront in the vectorization
7469 // pipeline.
7470 if (!L->empty())
7471 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7472 ORE, BFI, PSI, Hints);
7474 assert(L->empty() && "Inner loop expected.");
7475 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7476 // count by optimizing for size, to minimize overheads.
7477 // Prefer constant trip counts over profile data, over upper bound estimate.
7478 unsigned ExpectedTC = 0;
7479 bool HasExpectedTC = false;
7480 if (const SCEVConstant *ConstExits =
7481 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7482 const APInt &ExitsCount = ConstExits->getAPInt();
7483 // We are interested in small values for ExpectedTC. Skip over those that
7484 // can't fit an unsigned.
7485 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7486 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7487 HasExpectedTC = true;
7488 }
7489 }
7490 // ExpectedTC may be large because it's bound by a variable. Check
7491 // profiling information to validate we should vectorize.
7492 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7493 auto EstimatedTC = getLoopEstimatedTripCount(L);
7494 if (EstimatedTC) {
7495 ExpectedTC = *EstimatedTC;
7496 HasExpectedTC = true;
7497 }
7498 }
7499 if (!HasExpectedTC) {
7500 ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7501 HasExpectedTC = (ExpectedTC > 0);
7502 }
7504 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7505 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7506 << "This loop is worth vectorizing only if no scalar "
7507 << "iteration overheads are incurred.");
7508 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7509 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7510 else {
7511 LLVM_DEBUG(dbgs() << "\n");
7512 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7513 }
7514 }
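// Editorial worked example: a loop whose backedge-taken count is the constant
// 7 has ExpectedTC = 8. If that is below TinyTripCountVectorThreshold and
// vectorization was not explicitly forced, the code above downgrades the
// scalar epilogue policy to CM_ScalarEpilogueNotAllowedLowTripLoop, so the
// cost model will not plan on a scalar remainder loop for it.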
7516 // Check the function attributes to see if implicit floats are allowed.
7517 // FIXME: This check doesn't seem possibly correct -- what if the loop is
7518 // an integer loop and the vector instructions selected are purely integer
7519 // vector instructions?
7520 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7521 reportVectorizationFailure(
7522 "Can't vectorize when the NoImplicitFloat attribute is used",
7523 "loop not vectorized due to NoImplicitFloat attribute",
7524 "NoImplicitFloat", ORE, L);
7525 Hints.emitRemarkWithHints();
7526 return false;
7527 }
7529 // Check if the target supports potentially unsafe FP vectorization.
7530 // FIXME: Add a check for the type of safety issue (denormal, signaling)
7531 // for the target we're vectorizing for, to make sure none of the
7532 // additional fp-math flags can help.
7533 if (Hints.isPotentiallyUnsafe() &&
7534 TTI->isFPVectorizationPotentiallyUnsafe()) {
7535 reportVectorizationFailure(
7536 "Potentially unsafe FP op prevents vectorization",
7537 "loop not vectorized due to unsafe FP support.",
7538 "UnsafeFP", ORE, L);
7539 Hints.emitRemarkWithHints();
7540 return false;
7541 }
7543 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7544 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7546 // If an override option has been passed in for interleaved accesses, use it.
7547 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7548 UseInterleaved = EnableInterleavedMemAccesses;
7550 // Analyze interleaved memory accesses.
7551 if (UseInterleaved) {
7552 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7553 }
7555 // Use the cost model.
7556 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7557 F, &Hints, IAI);
7558 CM.collectValuesToIgnore();
7560 // Use the planner for vectorization.
7561 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7563 // Get user vectorization factor.
7564 unsigned UserVF = Hints.getWidth();
7566 // Plan how to best vectorize, return the best VF and its cost.
7567 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7569 VectorizationFactor VF = VectorizationFactor::Disabled();
7570 unsigned IC = 1;
7571 unsigned UserIC = Hints.getInterleave();
7573 if (MaybeVF) {
7574 VF = *MaybeVF;
7575 // Select the interleave count.
7576 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7577 }
7579 // Identify the diagnostic messages that should be produced.
7580 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7581 bool VectorizeLoop = true, InterleaveLoop = true;
7582 if (Requirements.doesNotMeet(F, L, Hints)) {
7583 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7584 "requirements.\n");
7585 Hints.emitRemarkWithHints();
7586 return false;
7587 }
7589 if (VF.Width == 1) {
7590 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7591 VecDiagMsg = std::make_pair(
7592 "VectorizationNotBeneficial",
7593 "the cost-model indicates that vectorization is not beneficial");
7594 VectorizeLoop = false;
7595 }
7597 if (!MaybeVF && UserIC > 1) {
7598 // Tell the user interleaving was avoided up-front, despite being explicitly
7599 // requested.
7600 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7601 "interleaving should be avoided up front\n");
7602 IntDiagMsg = std::make_pair(
7603 "InterleavingAvoided",
7604 "Ignoring UserIC, because interleaving was avoided up front");
7605 InterleaveLoop = false;
7606 } else if (IC == 1 && UserIC <= 1) {
7607 // Tell the user interleaving is not beneficial.
7608 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7609 IntDiagMsg = std::make_pair(
7610 "InterleavingNotBeneficial",
7611 "the cost-model indicates that interleaving is not beneficial");
7612 InterleaveLoop = false;
7613 if (UserIC == 1) {
7614 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7615 IntDiagMsg.second +=
7616 " and is explicitly disabled or interleave count is set to 1";
7617 }
7618 } else if (IC > 1 && UserIC == 1) {
7619 // Tell the user interleaving is beneficial, but it is explicitly disabled.
7620 LLVM_DEBUG(
7621 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7622 IntDiagMsg = std::make_pair(
7623 "InterleavingBeneficialButDisabled",
7624 "the cost-model indicates that interleaving is beneficial "
7625 "but is explicitly disabled or interleave count is set to 1");
7626 InterleaveLoop = false;
7627 }
7629 // Override IC if user provided an interleave count.
7630 IC = UserIC > 0 ? UserIC : IC;
7632 // Emit diagnostic messages, if any.
7633 const char *VAPassName = Hints.vectorizeAnalysisPassName();
7634 if (!VectorizeLoop && !InterleaveLoop) {
7635 // Do not vectorize or interleave the loop.
7636 ORE->emit([&]() {
7637 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7638 L->getStartLoc(), L->getHeader())
7639 << VecDiagMsg.second;
7640 });
7641 ORE->emit([&]() {
7642 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7643 L->getStartLoc(), L->getHeader())
7644 << IntDiagMsg.second;
7645 });
7646 return false;
7647 } else if (!VectorizeLoop && InterleaveLoop) {
7648 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7649 ORE->emit([&]() {
7650 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7651 L->getStartLoc(), L->getHeader())
7652 << VecDiagMsg.second;
7653 });
7654 } else if (VectorizeLoop && !InterleaveLoop) {
7655 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7656 << ") in " << DebugLocStr << '\n');
7657 ORE->emit([&]() {
7658 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7659 L->getStartLoc(), L->getHeader())
7660 << IntDiagMsg.second;
7661 });
7662 } else if (VectorizeLoop && InterleaveLoop) {
7663 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7664 << ") in " << DebugLocStr << '\n');
7665 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7666 }
7668 LVP.setBestPlan(VF.Width, IC);
7670 using namespace ore;
7671 bool DisableRuntimeUnroll = false;
7672 MDNode *OrigLoopID = L->getLoopID();
7674 if (!VectorizeLoop) {
7675 assert(IC > 1 && "interleave count should not be 1 or 0");
7676 // If we decided that it is not profitable to vectorize the loop, then
7677 // interleave it.
7678 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7679 &CM);
7680 LVP.executePlan(Unroller, DT);
7682 ORE->emit([&]() {
7683 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7684 L->getHeader())
7685 << "interleaved loop (interleaved count: "
7686 << NV("InterleaveCount", IC) << ")";
7687 });
7688 } else {
7689 // If we decided that it is *profitable* to vectorize the loop, then do it.
7690 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7691 &LVL, &CM);
7692 LVP.executePlan(LB, DT);
7693 ++LoopsVectorized;
7695 // Add metadata to disable runtime unrolling of the scalar loop when there are
7696 // no runtime checks about strides and memory. A scalar loop that is
7697 // rarely used is not worth unrolling.
7698 if (!LB.areSafetyChecksAdded())
7699 DisableRuntimeUnroll = true;
7701 // Report the vectorization decision.
7702 ORE->emit([&]() {
7703 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7704 L->getHeader())
7705 << "vectorized loop (vectorization width: "
7706 << NV("VectorizationFactor", VF.Width)
7707 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7708 });
7709 }
7711 Optional<MDNode *> RemainderLoopID =
7712 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7713 LLVMLoopVectorizeFollowupEpilogue});
7714 if (RemainderLoopID.hasValue()) {
7715 L->setLoopID(RemainderLoopID.getValue());
7716 } else {
7717 if (DisableRuntimeUnroll)
7718 AddRuntimeUnrollDisableMetaData(L);
7720 // Mark the loop as already vectorized to avoid vectorizing again.
7721 Hints.setAlreadyVectorized();
7722 }
7724 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7725 return true;
7726 }
7728 bool LoopVectorizePass::runImpl(
7729 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7730 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7731 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7732 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7733 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7734 SE = &SE_;
7735 LI = &LI_;
7736 TTI = &TTI_;
7737 DT = &DT_;
7738 BFI = &BFI_;
7739 TLI = TLI_;
7740 AA = &AA_;
7741 AC = &AC_;
7742 GetLAA = &GetLAA_;
7743 DB = &DB_;
7744 ORE = &ORE_;
7745 PSI = PSI_;
7747 // Don't attempt if
7748 // 1. the target claims to have no vector registers, and
7749 // 2. interleaving won't help ILP.
7751 // The second condition is necessary because, even if the target has no
7752 // vector registers, loop vectorization may still enable scalar
7753 // interleaving.
7754 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7755 return false;
7757 bool Changed = false;
7759 // The vectorizer requires loops to be in simplified form.
7760 // Since simplification may add new inner loops, it has to run before the
7761 // legality and profitability checks. This means running the loop vectorizer
7762 // will simplify all loops, regardless of whether anything ends up being
7763 // vectorized.
7764 for (auto &L : *LI)
7765 Changed |=
7766 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7768 // Build up a worklist of inner-loops to vectorize. This is necessary as
7769 // the act of vectorizing or partially unrolling a loop creates new loops
7770 // and can invalidate iterators across the loops.
7771 SmallVector<Loop *, 8> Worklist;
7773 for (Loop *L : *LI)
7774 collectSupportedLoops(*L, LI, ORE, Worklist);
7776 LoopsAnalyzed += Worklist.size();
7778 // Now walk the identified inner loops.
7779 while (!Worklist.empty()) {
7780 Loop *L = Worklist.pop_back_val();
7782 // For the inner loops we actually process, form LCSSA to simplify the
7783 // transform.
7784 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7786 Changed |= processLoop(L);
7787 }
7789 // Process each loop nest in the function.
7790 return Changed;
7791 }
7793 PreservedAnalyses LoopVectorizePass::run(Function &F,
7794 FunctionAnalysisManager &AM) {
7795 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7796 auto &LI = AM.getResult<LoopAnalysis>(F);
7797 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7798 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7799 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7800 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7801 auto &AA = AM.getResult<AAManager>(F);
7802 auto &AC = AM.getResult<AssumptionAnalysis>(F);
7803 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7804 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7805 MemorySSA *MSSA = EnableMSSALoopDependency
7806 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7807 : nullptr;
7809 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7810 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7811 [&](Loop &L) -> const LoopAccessInfo & {
7812 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7813 return LAM.getResult<LoopAccessAnalysis>(L, AR);
7814 };
7815 const ModuleAnalysisManager &MAM =
7816 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7817 ProfileSummaryInfo *PSI =
7818 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7819 bool Changed =
7820 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7821 if (!Changed)
7822 return PreservedAnalyses::all();
7823 PreservedAnalyses PA;
7825 // We currently do not preserve loopinfo/dominator analyses with outer loop
7826 // vectorization. Until this is addressed, mark these analyses as preserved
7827 // only for non-VPlan-native path.
7828 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7829 if (!EnableVPlanNativePath) {
7830 PA.preserve<LoopAnalysis>();
7831 PA.preserve<DominatorTreeAnalysis>();
7832 }
7833 PA.preserve<BasicAA>();
7834 PA.preserve<GlobalsAA>();
7835 return PA;
7836 }