1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to assess the profitability of vectorization.
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
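//
// For illustration (a simplified pseudocode sketch, not code taken from this
// pass), a scalar loop such as:
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 42;
//
// is rewritten for a vectorization factor of 4 so that each iteration
// processes four elements with one wide SIMD operation and the induction
// variable advances by 4:
//
//   for (int i = 0; i < n; i += 4)
//     a[i:i+3] = b[i:i+3] + 42;
//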
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 // of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 // widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 // of vectorization. It decides on the optimal vector width, which
26 // can be one, if vectorization is not profitable.
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
36 //===----------------------------------------------------------------------===//
38 // The reduction-variable vectorization is based on the paper:
39 // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
41 // Variable uniformity checks are inspired by:
42 // Karrenberg, R. and Hack, S. Whole Function Vectorization.
44 // The interleaved access vectorization is based on the paper:
45 // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46 // Data for SIMD
48 // Other ideas/concepts are from:
49 // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
51 // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52 // Vectorizing Compilers.
54 //===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
154 using namespace llvm;
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162 "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164 "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166 "llvm.loop.vectorize.followup_epilogue";
167 /// @}
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175 "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176 cl::desc("Loops with a constant trip count that is smaller than this "
177 "value are vectorized only if no scalar iteration overheads "
178 "are incurred."));
180 // Indicates that an epilogue is undesired and that predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184 "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185 cl::desc("Indicate that an epilogue is undesired, predication should be "
186 "used instead."));
188 static cl::opt<bool> MaximizeBandwidth(
189 "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190 cl::desc("Maximize bandwidth when selecting vectorization factor which "
191 "will be determined by the smallest type in loop."));
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194 "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200 "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
207 static cl::opt<unsigned> ForceTargetNumScalarRegs(
208 "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209 cl::desc("A flag that overrides the target's number of scalar registers."));
211 static cl::opt<unsigned> ForceTargetNumVectorRegs(
212 "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213 cl::desc("A flag that overrides the target's number of vector registers."));
215 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
216 "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217 cl::desc("A flag that overrides the target's max interleave factor for "
218 "scalar loops."));
220 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
221 "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222 cl::desc("A flag that overrides the target's max interleave factor for "
223 "vectorized loops."));
225 static cl::opt<unsigned> ForceTargetInstructionCost(
226 "force-target-instruction-cost", cl::init(0), cl::Hidden,
227 cl::desc("A flag that overrides the target's expected cost for "
228 "an instruction to a single constant value. Mostly "
229 "useful for getting consistent testing."));
231 static cl::opt<unsigned> SmallLoopCost(
232 "small-loop-cost", cl::init(20), cl::Hidden,
233 cl::desc(
234 "The cost of a loop that is considered 'small' by the interleaver."));
236 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
237 "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238 cl::desc("Enable the use of the block frequency analysis to access PGO "
239 "heuristics minimizing code growth in cold regions and being more "
240 "aggressive in hot regions."));
242 // Runtime interleave loops for load/store throughput.
243 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
244 "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245 cl::desc(
246 "Enable runtime interleaving until load/store ports are saturated"));
248 /// The number of stores in a loop that are allowed to need predication.
249 static cl::opt<unsigned> NumberOfStoresToPredicate(
250 "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251 cl::desc("Max number of stores to be predicated behind an if."));
253 static cl::opt<bool> EnableIndVarRegisterHeur(
254 "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255 cl::desc("Count the induction variable only once when interleaving"));
257 static cl::opt<bool> EnableCondStoresVectorization(
258 "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259 cl::desc("Enable if predication of stores during vectorization."));
261 static cl::opt<unsigned> MaxNestedScalarReductionIC(
262 "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263 cl::desc("The maximum interleave count to use when interleaving a scalar "
264 "reduction in a nested loop."));
266 cl::opt<bool> EnableVPlanNativePath(
267 "enable-vplan-native-path", cl::init(false), cl::Hidden,
268 cl::desc("Enable VPlan-native vectorization path with "
269 "support for outer loop vectorization."));
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
273 cl::opt<bool> EnableVPlanPredication(
274 "enable-vplan-predication", cl::init(false), cl::Hidden,
275 cl::desc("Enable VPlan-native vectorization path predicator with "
276 "support for outer loop vectorization."));
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
279 // VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283 "vplan-build-stress-test", cl::init(false), cl::Hidden,
284 cl::desc(
285 "Build VPlan for every supported loop nest in the function and bail "
286 "out right after the build (stress test the VPlan H-CFG construction "
287 "in the VPlan-native vectorization path)."));
289 cl::opt<bool> llvm::EnableLoopInterleaving(
290 "interleave-loops", cl::init(true), cl::Hidden,
291 cl::desc("Enable loop interleaving in Loop vectorization passes"));
292 cl::opt<bool> llvm::EnableLoopVectorization(
293 "vectorize-loops", cl::init(true), cl::Hidden,
294 cl::desc("Run the Loop vectorization passes"));
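// As a usage sketch (assuming a recent `opt` built with this pass; the exact
// pass-name spelling may vary between pass managers), the flags declared above
// can be passed on the opt command line, e.g.:
//
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 \
//       -enable-interleaved-mem-accesses -S input.ll -o vectorized.ll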
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
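/// For example, ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1)
/// and ToVectorTy(void, 4) return the type unchanged (an illustration of the
/// rule above, not additional behavior).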
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300 if (Scalar->isVoidTy() || VF == 1)
301 return Scalar;
302 return VectorType::get(Scalar, VF);
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308 "Expected Load or Store instruction");
309 if (auto *LI = dyn_cast<LoadInst>(I))
310 return LI->getType();
311 return cast<StoreInst>(I)->getValueOperand()->getType();
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
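/// For example (an illustrative case, not an exhaustive list): x86_fp80 holds
/// 80 bits of data but is padded to a larger alloc size (e.g. 128 bits), so it
/// is irregular for VF == 1, and the padded scalars are likewise not
/// bitcast-compatible with a tightly packed <VF x x86_fp80> vector for VF > 1.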
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318 // Determine if an array of VF elements of type Ty is "bitcast compatible"
319 // with a <VF x Ty> vector.
320 if (VF > 1) {
321 auto *VectorTy = VectorType::get(Ty, VF);
322 return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
325 // If the vectorization factor is one, we just check if an array of type Ty
326 // requires padding between elements.
327 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
334 /// TODO: We should use actual block probability here, if available. Currently,
335 /// we always assume predicated blocks have a 50% chance of executing.
336 static unsigned getReciprocalPredBlockProb() { return 2; }
338 /// A helper function that adds a 'fast' flag to floating-point operations.
339 static Value *addFastMathFlag(Value *V) {
340 if (isa<FPMathOperator>(V))
341 cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342 return V;
345 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
346 if (isa<FPMathOperator>(V))
347 cast<Instruction>(V)->setFastMathFlags(FMF);
348 return V;
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355 : ConstantFP::get(Ty, C);
358 namespace llvm {
360 /// InnerLoopVectorizer vectorizes loops which contain only one basic
361 /// block to a specified vectorization factor (VF).
362 /// This class performs the widening of scalars into vectors, or multiple
363 /// scalars. This class also implements the following features:
364 /// * It inserts an epilogue loop for handling loops that don't have iteration
365 /// counts that are known to be a multiple of the vectorization factor.
366 /// * It handles the code generation for reduction variables.
367 /// * Scalarization (implementation using scalars) of un-vectorizable
368 /// instructions.
369 /// InnerLoopVectorizer does not perform any vectorization-legality
370 /// checks, and relies on the caller to check for the different legality
371 /// aspects. The InnerLoopVectorizer relies on the
372 /// LoopVectorizationLegality class to provide information about the induction
373 /// and reduction variables that were found, for a given vectorization factor.
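///
/// A rough sketch of the control flow this class creates (simplified; the
/// actual blocks and edges are produced by createVectorizedLoopSkeleton):
///
///   entry checks --(fail)--> scalar preheader --> scalar (epilogue) loop
///   entry checks --(pass)--> vector preheader --> vector loop body
///   vector loop body --> middle block --> exit, or --> scalar preheader to
///   run the remaining iterations in the scalar epilogue loop.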
374 class InnerLoopVectorizer {
375 public:
376 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
377 LoopInfo *LI, DominatorTree *DT,
378 const TargetLibraryInfo *TLI,
379 const TargetTransformInfo *TTI, AssumptionCache *AC,
380 OptimizationRemarkEmitter *ORE, unsigned VecWidth,
381 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
382 LoopVectorizationCostModel *CM)
383 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
384 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
385 Builder(PSE.getSE()->getContext()),
386 VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
387 virtual ~InnerLoopVectorizer() = default;
389 /// Create a new empty loop. Unlink the old loop and connect the new one.
390 /// Return the pre-header block of the new loop.
391 BasicBlock *createVectorizedLoopSkeleton();
393 /// Widen a single instruction within the innermost loop.
394 void widenInstruction(Instruction &I);
396 /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
397 void fixVectorizedLoop();
399 // Return true if any runtime check is added.
400 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
402 /// A type for vectorized values in the new loop. Each value from the
403 /// original loop, when vectorized, is represented by UF vector values in the
404 /// new unrolled loop, where UF is the unroll factor.
405 using VectorParts = SmallVector<Value *, 2>;
407 /// Vectorize a single PHINode in a block. This method handles the induction
408 /// variable canonicalization. It supports both VF = 1 for unrolled loops and
409 /// arbitrary length vectors.
410 void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
412 /// A helper function to scalarize a single Instruction in the innermost loop.
413 /// Generates a sequence of scalar instances for each lane between \p MinLane
414 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
415 /// inclusive.
416 void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
417 bool IfPredicateInstr);
419 /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
420 /// is provided, the integer induction variable will first be truncated to
421 /// the corresponding type.
422 void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
424 /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
425 /// vector or scalar value on-demand if one is not yet available. When
426 /// vectorizing a loop, we visit the definition of an instruction before its
427 /// uses. When visiting the definition, we either vectorize or scalarize the
428 /// instruction, creating an entry for it in the corresponding map. (In some
429 /// cases, such as induction variables, we will create both vector and scalar
430 /// entries.) Then, as we encounter uses of the definition, we derive values
431 /// for each scalar or vector use unless such a value is already available.
432 /// For example, if we scalarize a definition and one of its uses is vector,
433 /// we build the required vector on-demand with an insertelement sequence
434 /// when visiting the use. Otherwise, if the use is scalar, we can use the
435 /// existing scalar definition.
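///
/// As a concrete illustration (simplified IR with hypothetical value names,
/// VF = 4, UF = 1): if %d was scalarized into %d.0 .. %d.3 and a later
/// instruction needs it as a vector, the vector is built on demand with an
/// insertelement chain, roughly:
///
///   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
///   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
///   %v.2 = insertelement <4 x i32> %v.1,  i32 %d.2, i32 2
///   %v.3 = insertelement <4 x i32> %v.2,  i32 %d.3, i32 3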
437 /// Return a value in the new loop corresponding to \p V from the original
438 /// loop at unroll index \p Part. If the value has already been vectorized,
439 /// the corresponding vector entry in VectorLoopValueMap is returned. If,
440 /// however, the value has a scalar entry in VectorLoopValueMap, we construct
441 /// a new vector value on-demand by inserting the scalar values into a vector
442 /// with an insertelement sequence. If the value has been neither vectorized
443 /// nor scalarized, it must be loop invariant, so we simply broadcast the
444 /// value into a vector.
445 Value *getOrCreateVectorValue(Value *V, unsigned Part);
447 /// Return a value in the new loop corresponding to \p V from the original
448 /// loop at unroll and vector indices \p Instance. If the value has been
449 /// vectorized but not scalarized, the necessary extractelement instruction
450 /// will be generated.
451 Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
453 /// Construct the vector value of a scalarized value \p V one lane at a time.
454 void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
456 /// Try to vectorize the interleaved access group that \p Instr belongs to,
457 /// optionally masking the vector operations if \p BlockInMask is non-null.
458 void vectorizeInterleaveGroup(Instruction *Instr,
459 VectorParts *BlockInMask = nullptr);
461 /// Vectorize Load and Store instructions, optionally masking the vector
462 /// operations if \p BlockInMask is non-null.
463 void vectorizeMemoryInstruction(Instruction *Instr,
464 VectorParts *BlockInMask = nullptr);
466 /// Set the debug location in the builder using the debug location in
467 /// the instruction.
468 void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
470 /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
471 void fixNonInductionPHIs(void);
473 protected:
474 friend class LoopVectorizationPlanner;
476 /// A small list of PHINodes.
477 using PhiVector = SmallVector<PHINode *, 4>;
479 /// A type for scalarized values in the new loop. Each value from the
480 /// original loop, when scalarized, is represented by UF x VF scalar values
481 /// in the new unrolled loop, where UF is the unroll factor and VF is the
482 /// vectorization factor.
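/// For example (an illustrative layout only): with UF = 2 and VF = 4 a
/// scalarized value occupies 2 x 4 = 8 scalar slots, addressed by a
/// (part, lane) pair such as (1, 2) for the third lane of the second part.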
483 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
485 /// Set up the values of the IVs correctly when exiting the vector loop.
486 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
487 Value *CountRoundDown, Value *EndValue,
488 BasicBlock *MiddleBlock);
490 /// Create a new induction variable inside L.
491 PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
492 Value *Step, Instruction *DL);
494 /// Handle all cross-iteration phis in the header.
495 void fixCrossIterationPHIs();
497 /// Fix a first-order recurrence. This is the second phase of vectorizing
498 /// this phi node.
499 void fixFirstOrderRecurrence(PHINode *Phi);
501 /// Fix a reduction cross-iteration phi. This is the second phase of
502 /// vectorizing this phi node.
503 void fixReduction(PHINode *Phi);
505 /// The Loop exit block may have single value PHI nodes with some
506 /// incoming value. While vectorizing we only handled real values
507 /// that were defined inside the loop and we should have one value for
508 /// each predecessor of its parent basic block. See PR14725.
509 void fixLCSSAPHIs();
511 /// Iteratively sink the scalarized operands of a predicated instruction into
512 /// the block that was created for it.
513 void sinkScalarOperands(Instruction *PredInst);
515 /// Shrinks vector element sizes to the smallest bitwidth they can be legally
516 /// represented as.
517 void truncateToMinimalBitwidths();
519 /// Insert the new loop to the loop hierarchy and pass manager
520 /// and update the analysis passes.
521 void updateAnalysis();
523 /// Create a broadcast instruction. This method generates a broadcast
524 /// instruction (shuffle) for loop invariant values and for the induction
525 /// value. If this is the induction variable then we extend it to N, N+1, ...
526 /// this is needed because each iteration in the loop corresponds to a SIMD
527 /// element.
528 virtual Value *getBroadcastInstrs(Value *V);
530 /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
531 /// to each vector element of Val. The sequence starts at StartIdx.
532 /// \p Opcode is relevant for FP induction variable.
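/// For example (an illustrative case, VF = 4): with StartIdx = 0 and step %s,
/// the vector <0, %s, 2*%s, 3*%s> is added lane-wise to Val (typically a
/// broadcast of the scalar induction value), producing the per-lane values of
/// an induction with stride %s.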
533 virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
534 Instruction::BinaryOps Opcode =
535 Instruction::BinaryOpsEnd);
537 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
538 /// variable on which to base the steps, \p Step is the size of the step, and
539 /// \p EntryVal is the value from the original loop that maps to the steps.
540 /// Note that \p EntryVal doesn't have to be an induction variable - it
541 /// can also be a truncate instruction.
542 void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
543 const InductionDescriptor &ID);
545 /// Create a vector induction phi node based on an existing scalar one. \p
546 /// EntryVal is the value from the original loop that maps to the vector phi
547 /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
548 /// truncate instruction, instead of widening the original IV, we widen a
549 /// version of the IV truncated to \p EntryVal's type.
550 void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
551 Value *Step, Instruction *EntryVal);
553 /// Returns true if an instruction \p I should be scalarized instead of
554 /// vectorized for the chosen vectorization factor.
555 bool shouldScalarizeInstruction(Instruction *I) const;
557 /// Returns true if we should generate a scalar version of \p IV.
558 bool needsScalarInduction(Instruction *IV) const;
560 /// If there is a cast involved in the induction variable \p ID, which should
561 /// be ignored in the vectorized loop body, this function records the
562 /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
563 /// cast. We had already proved that the casted Phi is equal to the uncasted
564 /// Phi in the vectorized loop (under a runtime guard), and therefore
565 /// there is no need to vectorize the cast - the same value can be used in the
566 /// vector loop for both the Phi and the cast.
567 /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
568 /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
570 /// \p EntryVal is the value from the original loop that maps to the vector
571 /// phi node and is used to distinguish what is the IV currently being
572 /// processed - original one (if \p EntryVal is a phi corresponding to the
573 /// original IV) or the "newly-created" one based on the proof mentioned above
574 /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
575 /// latter case \p EntryVal is a TruncInst and we must not record anything for
576 /// that IV, but it's error-prone to expect callers of this routine to care
577 /// about that, hence this explicit parameter.
578 void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
579 const Instruction *EntryVal,
580 Value *VectorLoopValue,
581 unsigned Part,
582 unsigned Lane = UINT_MAX);
584 /// Generate a shuffle sequence that will reverse the vector Vec.
585 virtual Value *reverseVector(Value *Vec);
587 /// Returns (and creates if needed) the original loop trip count.
588 Value *getOrCreateTripCount(Loop *NewLoop);
590 /// Returns (and creates if needed) the trip count of the widened loop.
591 Value *getOrCreateVectorTripCount(Loop *NewLoop);
593 /// Returns a bitcasted value to the requested vector type.
594 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
595 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
596 const DataLayout &DL);
598 /// Emit a bypass check to see if the vector trip count is zero, including if
599 /// it overflows.
600 void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
602 /// Emit a bypass check to see if all of the SCEV assumptions we've
603 /// had to make are correct.
604 void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
606 /// Emit bypass checks to check any memory assumptions we may have made.
607 void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
609 /// Compute the transformed value of Index at offset StartValue using step
610 /// StepValue.
611 /// For integer induction, returns StartValue + Index * StepValue.
612 /// For pointer induction, returns StartValue[Index * StepValue].
613 /// FIXME: The newly created binary instructions should contain nsw/nuw
614 /// flags, which can be found from the original scalar operations.
615 Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
616 const DataLayout &DL,
617 const InductionDescriptor &ID) const;
619 /// Add additional metadata to \p To that was not present on \p Orig.
621 /// Currently this is used to add the noalias annotations based on the
622 /// inserted memchecks. Use this for instructions that are *cloned* into the
623 /// vector loop.
624 void addNewMetadata(Instruction *To, const Instruction *Orig);
626 /// Add metadata from one instruction to another.
628 /// This includes both the original MDs from \p From and additional ones (\see
629 /// addNewMetadata). Use this for *newly created* instructions in the vector
630 /// loop.
631 void addMetadata(Instruction *To, Instruction *From);
633 /// Similar to the previous function but it adds the metadata to a
634 /// vector of instructions.
635 void addMetadata(ArrayRef<Value *> To, Instruction *From);
637 /// The original loop.
638 Loop *OrigLoop;
640 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
641 /// dynamic knowledge to simplify SCEV expressions and converts them to a
642 /// more usable form.
643 PredicatedScalarEvolution &PSE;
645 /// Loop Info.
646 LoopInfo *LI;
648 /// Dominator Tree.
649 DominatorTree *DT;
651 /// Alias Analysis.
652 AliasAnalysis *AA;
654 /// Target Library Info.
655 const TargetLibraryInfo *TLI;
657 /// Target Transform Info.
658 const TargetTransformInfo *TTI;
660 /// Assumption Cache.
661 AssumptionCache *AC;
663 /// Interface to emit optimization remarks.
664 OptimizationRemarkEmitter *ORE;
666 /// LoopVersioning. It's only set up (non-null) if memchecks were
667 /// used.
669 /// This is currently only used to add no-alias metadata based on the
670 /// memchecks. The actual versioning is performed manually.
671 std::unique_ptr<LoopVersioning> LVer;
673 /// The vectorization SIMD factor to use. Each vector will have this many
674 /// vector elements.
675 unsigned VF;
677 /// The vectorization unroll factor to use. Each scalar is vectorized to this
678 /// many different vector instructions.
679 unsigned UF;
681 /// The builder that we use
682 IRBuilder<> Builder;
684 // --- Vectorization state ---
686 /// The vector-loop preheader.
687 BasicBlock *LoopVectorPreHeader;
689 /// The scalar-loop preheader.
690 BasicBlock *LoopScalarPreHeader;
692 /// Middle Block between the vector and the scalar.
693 BasicBlock *LoopMiddleBlock;
695 /// The ExitBlock of the scalar loop.
696 BasicBlock *LoopExitBlock;
698 /// The vector loop body.
699 BasicBlock *LoopVectorBody;
701 /// The scalar loop body.
702 BasicBlock *LoopScalarBody;
704 /// A list of all bypass blocks. The first block is the entry of the loop.
705 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
707 /// The new Induction variable which was added to the new block.
708 PHINode *Induction = nullptr;
710 /// The induction variable of the old basic block.
711 PHINode *OldInduction = nullptr;
713 /// Maps values from the original loop to their corresponding values in the
714 /// vectorized loop. A key value can map to either vector values, scalar
715 /// values or both kinds of values, depending on whether the key was
716 /// vectorized and scalarized.
717 VectorizerValueMap VectorLoopValueMap;
719 /// Store instructions that were predicated.
720 SmallVector<Instruction *, 4> PredicatedInstructions;
722 /// Trip count of the original loop.
723 Value *TripCount = nullptr;
725 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
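/// (Illustrative arithmetic: TripCount = 13, VF = 4, UF = 2 gives
/// VectorTripCount = 13 - 13 % 8 = 8, so 8 iterations run in the vector loop
/// and the remaining 5 run in the scalar epilogue.)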
726 Value *VectorTripCount = nullptr;
728 /// The legality analysis.
729 LoopVectorizationLegality *Legal;
731 /// The profitability analysis.
732 LoopVectorizationCostModel *Cost;
734 // Record whether runtime checks are added.
735 bool AddedSafetyChecks = false;
737 // Holds the end values for each induction variable. We save the end values
738 // so we can later fix-up the external users of the induction variables.
739 DenseMap<PHINode *, Value *> IVEndValues;
741 // Vector of original scalar PHIs whose corresponding widened PHIs need to be
742 // fixed up at the end of vector code generation.
743 SmallVector<PHINode *, 8> OrigPHIsToFix;
746 class InnerLoopUnroller : public InnerLoopVectorizer {
747 public:
748 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
749 LoopInfo *LI, DominatorTree *DT,
750 const TargetLibraryInfo *TLI,
751 const TargetTransformInfo *TTI, AssumptionCache *AC,
752 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
753 LoopVectorizationLegality *LVL,
754 LoopVectorizationCostModel *CM)
755 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
756 UnrollFactor, LVL, CM) {}
758 private:
759 Value *getBroadcastInstrs(Value *V) override;
760 Value *getStepVector(Value *Val, int StartIdx, Value *Step,
761 Instruction::BinaryOps Opcode =
762 Instruction::BinaryOpsEnd) override;
763 Value *reverseVector(Value *Vec) override;
766 } // end namespace llvm
768 /// Look for a meaningful debug location on the instruction or its
769 /// operands.
770 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
771 if (!I)
772 return I;
774 DebugLoc Empty;
775 if (I->getDebugLoc() != Empty)
776 return I;
778 for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
779 if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
780 if (OpInst->getDebugLoc() != Empty)
781 return OpInst;
784 return I;
787 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
788 if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
789 const DILocation *DIL = Inst->getDebugLoc();
790 if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
791 !isa<DbgInfoIntrinsic>(Inst)) {
792 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
793 if (NewDIL)
794 B.SetCurrentDebugLocation(NewDIL.getValue());
795 else
796 LLVM_DEBUG(dbgs()
797 << "Failed to create new discriminator: "
798 << DIL->getFilename() << " Line: " << DIL->getLine());
800 else
801 B.SetCurrentDebugLocation(DIL);
802 } else
803 B.SetCurrentDebugLocation(DebugLoc());
806 /// Write a record \p DebugMsg about vectorization failure to the debug
807 /// output stream. If \p I is passed, it is an instruction that prevents
808 /// vectorization.
809 #ifndef NDEBUG
810 static void debugVectorizationFailure(const StringRef DebugMsg,
811 Instruction *I) {
812 dbgs() << "LV: Not vectorizing: " << DebugMsg;
813 if (I != nullptr)
814 dbgs() << " " << *I;
815 else
816 dbgs() << '.';
817 dbgs() << '\n';
819 #endif
821 /// Create an analysis remark that explains why vectorization failed
823 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
824 /// RemarkName is the identifier for the remark. If \p I is passed it is an
825 /// instruction that prevents vectorization. Otherwise \p TheLoop is used for
826 /// the location of the remark. \return the remark object that can be
827 /// streamed to.
828 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
829 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
830 Value *CodeRegion = TheLoop->getHeader();
831 DebugLoc DL = TheLoop->getStartLoc();
833 if (I) {
834 CodeRegion = I->getParent();
835 // If there is no debug location attached to the instruction, fall back to
836 // using the loop's.
837 if (I->getDebugLoc())
838 DL = I->getDebugLoc();
841 OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
842 R << "loop not vectorized: ";
843 return R;
846 namespace llvm {
848 void reportVectorizationFailure(const StringRef DebugMsg,
849 const StringRef OREMsg, const StringRef ORETag,
850 OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
851 LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
852 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
853 ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
854 ORETag, TheLoop, I) << OREMsg);
857 } // end namespace llvm
859 #ifndef NDEBUG
860 /// \return string containing a file name and a line # for the given loop.
861 static std::string getDebugLocString(const Loop *L) {
862 std::string Result;
863 if (L) {
864 raw_string_ostream OS(Result);
865 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
866 LoopDbgLoc.print(OS);
867 else
868 // Just print the module name.
869 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
870 OS.flush();
872 return Result;
874 #endif
876 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
877 const Instruction *Orig) {
878 // If the loop was versioned with memchecks, add the corresponding no-alias
879 // metadata.
880 if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
881 LVer->annotateInstWithNoAlias(To, Orig);
884 void InnerLoopVectorizer::addMetadata(Instruction *To,
885 Instruction *From) {
886 propagateMetadata(To, From);
887 addNewMetadata(To, From);
890 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
891 Instruction *From) {
892 for (Value *V : To) {
893 if (Instruction *I = dyn_cast<Instruction>(V))
894 addMetadata(I, From);
898 namespace llvm {
900 // Loop vectorization cost-model hints how the scalar epilogue loop should be
901 // lowered.
902 enum ScalarEpilogueLowering {
904 // The default: allowing scalar epilogues.
905 CM_ScalarEpilogueAllowed,
907 // Vectorization with OptForSize: don't allow epilogues.
908 CM_ScalarEpilogueNotAllowedOptSize,
910 // A special case of vectorization with OptForSize: loops with a very small
911 // trip count are considered for vectorization under OptForSize, thereby
912 // making sure the cost of their loop body is dominant, free of runtime
913 // guards and scalar iteration overheads.
914 CM_ScalarEpilogueNotAllowedLowTripLoop,
916 // Loop hint predicate indicating an epilogue is undesired.
917 CM_ScalarEpilogueNotNeededUsePredicate
920 /// LoopVectorizationCostModel - estimates the expected speedups due to
921 /// vectorization.
922 /// In many cases vectorization is not profitable. This can happen for
923 /// a number of reasons. In this class we mainly attempt to predict the
924 /// expected speedup/slowdowns due to the supported instruction set. We use the
925 /// TargetTransformInfo to query the different backends for the cost of
926 /// different operations.
927 class LoopVectorizationCostModel {
928 public:
929 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
930 PredicatedScalarEvolution &PSE, LoopInfo *LI,
931 LoopVectorizationLegality *Legal,
932 const TargetTransformInfo &TTI,
933 const TargetLibraryInfo *TLI, DemandedBits *DB,
934 AssumptionCache *AC,
935 OptimizationRemarkEmitter *ORE, const Function *F,
936 const LoopVectorizeHints *Hints,
937 InterleavedAccessInfo &IAI)
938 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
939 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
940 Hints(Hints), InterleaveInfo(IAI) {}
942 /// \return An upper bound for the vectorization factor, or None if
943 /// vectorization and interleaving should be avoided up front.
944 Optional<unsigned> computeMaxVF();
946 /// \return True if runtime checks are required for vectorization, and false
947 /// otherwise.
948 bool runtimeChecksRequired();
950 /// \return The most profitable vectorization factor and the cost of that VF.
951 /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
952 /// then this vectorization factor will be selected if vectorization is
953 /// possible.
954 VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
956 /// Setup cost-based decisions for user vectorization factor.
957 void selectUserVectorizationFactor(unsigned UserVF) {
958 collectUniformsAndScalars(UserVF);
959 collectInstsToScalarize(UserVF);
962 /// \return The size (in bits) of the smallest and widest types in the code
963 /// that needs to be vectorized. We ignore values that remain scalar such as
964 /// 64 bit loop indices.
965 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
967 /// \return The desired interleave count.
968 /// If interleave count has been specified by metadata it will be returned.
969 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
970 /// are the selected vectorization factor and the cost of the selected VF.
971 unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
973 /// A memory access instruction may be vectorized in more than one way.
974 /// The form of the instruction after vectorization depends on its cost.
975 /// This function takes cost-based decisions for Load/Store instructions
976 /// and collects them in a map. This decision map is used for building
977 /// the lists of loop-uniform and loop-scalar instructions.
978 /// The calculated cost is saved with the widening decision in order to
979 /// avoid redundant calculations.
980 void setCostBasedWideningDecision(unsigned VF);
982 /// A struct that represents some properties of the register usage
983 /// of a loop.
984 struct RegisterUsage {
985 /// Holds the number of loop invariant values that are used in the loop.
986 unsigned LoopInvariantRegs;
988 /// Holds the maximum number of concurrent live intervals in the loop.
989 unsigned MaxLocalUsers;
992 /// \return Returns information about the register usages of the loop for the
993 /// given vectorization factors.
994 SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
996 /// Collect values we want to ignore in the cost model.
997 void collectValuesToIgnore();
999 /// \returns The smallest bitwidth each instruction can be represented with.
1000 /// The vector equivalents of these instructions should be truncated to this
1001 /// type.
1002 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1003 return MinBWs;
1006 /// \returns True if it is more profitable to scalarize instruction \p I for
1007 /// vectorization factor \p VF.
1008 bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1009 assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1011 // Cost model is not run in the VPlan-native path - return conservative
1012 // result until this changes.
1013 if (EnableVPlanNativePath)
1014 return false;
1016 auto Scalars = InstsToScalarize.find(VF);
1017 assert(Scalars != InstsToScalarize.end() &&
1018 "VF not yet analyzed for scalarization profitability");
1019 return Scalars->second.find(I) != Scalars->second.end();
1022 /// Returns true if \p I is known to be uniform after vectorization.
1023 bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1024 if (VF == 1)
1025 return true;
1027 // Cost model is not run in the VPlan-native path - return conservative
1028 // result until this changes.
1029 if (EnableVPlanNativePath)
1030 return false;
1032 auto UniformsPerVF = Uniforms.find(VF);
1033 assert(UniformsPerVF != Uniforms.end() &&
1034 "VF not yet analyzed for uniformity");
1035 return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1038 /// Returns true if \p I is known to be scalar after vectorization.
1039 bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1040 if (VF == 1)
1041 return true;
1043 // Cost model is not run in the VPlan-native path - return conservative
1044 // result until this changes.
1045 if (EnableVPlanNativePath)
1046 return false;
1048 auto ScalarsPerVF = Scalars.find(VF);
1049 assert(ScalarsPerVF != Scalars.end() &&
1050 "Scalar values are not calculated for VF");
1051 return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1054 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1055 /// for vectorization factor \p VF.
1056 bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1057 return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1058 !isProfitableToScalarize(I, VF) &&
1059 !isScalarAfterVectorization(I, VF);
1062 /// Decision that was taken during cost calculation for memory instruction.
1063 enum InstWidening {
1064 CM_Unknown,
1065 CM_Widen, // For consecutive accesses with stride +1.
1066 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1067 CM_Interleave,
1068 CM_GatherScatter,
1069 CM_Scalarize
1072 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073 /// instruction \p I and vector width \p VF.
1074 void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1075 unsigned Cost) {
1076 assert(VF >= 2 && "Expected VF >=2");
1077 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1080 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1081 /// interleaving group \p Grp and vector width \p VF.
1082 void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1083 InstWidening W, unsigned Cost) {
1084 assert(VF >= 2 && "Expected VF >=2");
1085 /// Broadcast this decision to all instructions inside the group.
1086 /// But the cost will be assigned to one instruction only.
1087 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1088 if (auto *I = Grp->getMember(i)) {
1089 if (Grp->getInsertPos() == I)
1090 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1091 else
1092 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1097 /// Return the cost model decision for the given instruction \p I and vector
1098 /// width \p VF. Return CM_Unknown if this instruction did not pass
1099 /// through the cost modeling.
1100 InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1101 assert(VF >= 2 && "Expected VF >=2");
1103 // Cost model is not run in the VPlan-native path - return conservative
1104 // result until this changes.
1105 if (EnableVPlanNativePath)
1106 return CM_GatherScatter;
1108 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1109 auto Itr = WideningDecisions.find(InstOnVF);
1110 if (Itr == WideningDecisions.end())
1111 return CM_Unknown;
1112 return Itr->second.first;
1115 /// Return the vectorization cost for the given instruction \p I and vector
1116 /// width \p VF.
1117 unsigned getWideningCost(Instruction *I, unsigned VF) {
1118 assert(VF >= 2 && "Expected VF >=2");
1119 std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1120 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1121 "The cost is not calculated");
1122 return WideningDecisions[InstOnVF].second;
1125 /// Return True if instruction \p I is an optimizable truncate whose operand
1126 /// is an induction variable. Such a truncate will be removed by adding a new
1127 /// induction variable with the destination type.
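/// For example (a hypothetical snippet): for a loop with an i64 induction
/// %iv, a use such as
///   %t = trunc i64 %iv to i32
/// can be optimized away by introducing a new i32 induction variable and
/// replacing the truncate with it.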
1128 bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1129 // If the instruction is not a truncate, return false.
1130 auto *Trunc = dyn_cast<TruncInst>(I);
1131 if (!Trunc)
1132 return false;
1134 // Get the source and destination types of the truncate.
1135 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1136 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1138 // If the truncate is free for the given types, return false. Replacing a
1139 // free truncate with an induction variable would add an induction variable
1140 // update instruction to each iteration of the loop. We exclude from this
1141 // check the primary induction variable since it will need an update
1142 // instruction regardless.
1143 Value *Op = Trunc->getOperand(0);
1144 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145 return false;
1147 // If the truncated value is not an induction variable, return false.
1148 return Legal->isInductionPhi(Op);
1151 /// Collects the instructions to scalarize for each predicated instruction in
1152 /// the loop.
1153 void collectInstsToScalarize(unsigned VF);
1155 /// Collect Uniform and Scalar values for the given \p VF.
1156 /// The sets depend on CM decision for Load/Store instructions
1157 /// that may be vectorized as interleave, gather-scatter or scalarized.
1158 void collectUniformsAndScalars(unsigned VF) {
1159 // Do the analysis once.
1160 if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1161 return;
1162 setCostBasedWideningDecision(VF);
1163 collectLoopUniforms(VF);
1164 collectLoopScalars(VF);
1167 /// Returns true if the target machine supports masked store operation
1168 /// for the given \p DataType and kind of access to \p Ptr.
1169 bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1170 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1173 /// Returns true if the target machine supports masked load operation
1174 /// for the given \p DataType and kind of access to \p Ptr.
1175 bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1176 return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1179 /// Returns true if the target machine supports masked scatter operation
1180 /// for the given \p DataType.
1181 bool isLegalMaskedScatter(Type *DataType) {
1182 return TTI.isLegalMaskedScatter(DataType);
1185 /// Returns true if the target machine supports masked gather operation
1186 /// for the given \p DataType.
1187 bool isLegalMaskedGather(Type *DataType) {
1188 return TTI.isLegalMaskedGather(DataType);
1191 /// Returns true if the target machine can represent \p V as a masked gather
1192 /// or scatter operation.
1193 bool isLegalGatherOrScatter(Value *V) {
1194 bool LI = isa<LoadInst>(V);
1195 bool SI = isa<StoreInst>(V);
1196 if (!LI && !SI)
1197 return false;
1198 auto *Ty = getMemInstValueType(V);
1199 return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1202 /// Returns true if \p I is an instruction that will be scalarized with
1203 /// predication. Such instructions include conditional stores and
1204 /// instructions that may divide by zero.
1205 /// If a non-zero VF has been calculated, we check if I will be scalarized
1206 /// with predication for that VF.
1207 bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1209 // Returns true if \p I is an instruction that will be predicated either
1210 // through scalar predication or masked load/store or masked gather/scatter.
1211 // Superset of instructions that return true for isScalarWithPredication.
1212 bool isPredicatedInst(Instruction *I) {
1213 if (!blockNeedsPredication(I->getParent()))
1214 return false;
1215 // Loads and stores that need some form of masked operation are predicated
1216 // instructions.
1217 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1218 return Legal->isMaskRequired(I);
1219 return isScalarWithPredication(I);
1222 /// Returns true if \p I is a memory instruction with consecutive memory
1223 /// access that can be widened.
1224 bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1226 /// Returns true if \p I is a memory instruction in an interleaved-group
1227 /// of memory accesses that can be vectorized with wide vector loads/stores
1228 /// and shuffles.
1229 bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1231 /// Check if \p Instr belongs to any interleaved access group.
1232 bool isAccessInterleaved(Instruction *Instr) {
1233 return InterleaveInfo.isInterleaved(Instr);
1236 /// Get the interleaved access group that \p Instr belongs to.
1237 const InterleaveGroup<Instruction> *
1238 getInterleavedAccessGroup(Instruction *Instr) {
1239 return InterleaveInfo.getInterleaveGroup(Instr);
1242 /// Returns true if an interleaved group requires a scalar iteration
1243 /// to handle accesses with gaps, and there is nothing preventing us from
1244 /// creating a scalar epilogue.
1245 bool requiresScalarEpilogue() const {
1246 return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1249 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1250 /// loop hint annotation.
1251 bool isScalarEpilogueAllowed() const {
1252 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1255 /// Returns true if all loop blocks should be masked to fold tail loop.
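/// (Sketch of the idea, not the exact IR: with VF = 4 and trip count n, the
/// vector loop runs ceil(n / 4) iterations and each lane is guarded by a
/// predicate such as <i+0 < n, i+1 < n, i+2 < n, i+3 < n>, so no scalar
/// epilogue is needed for the leftover iterations.)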
1256 bool foldTailByMasking() const { return FoldTailByMasking; }
1258 bool blockNeedsPredication(BasicBlock *BB) {
1259 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1262 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1263 /// with factor VF. Return the cost of the instruction, including
1264 /// scalarization overhead if it's needed.
1265 unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1267 /// Estimate cost of a call instruction CI if it were vectorized with factor
1268 /// VF. Return the cost of the instruction, including scalarization overhead
1269 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1270 /// scalarized -
1271 /// i.e. either a vector version isn't available or it is too expensive.
1272 unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1274 private:
1275 unsigned NumPredStores = 0;
1277 /// \return An upper bound for the vectorization factor, larger than zero.
1278 /// One is returned if vectorization should best be avoided due to cost.
1279 unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1281 /// The vectorization cost is a combination of the cost itself and a boolean
1282 /// indicating whether any of the contributing operations will actually
1283 /// operate on
1284 /// vector values after type legalization in the backend. If this latter value
1285 /// is
1286 /// false, then all operations will be scalarized (i.e. no vectorization has
1287 /// actually taken place).
1288 using VectorizationCostTy = std::pair<unsigned, bool>;
1290 /// Returns the expected execution cost. The unit of the cost does
1291 /// not matter because we use the 'cost' units to compare different
1292 /// vector widths. The cost that is returned is *not* normalized by
1293 /// the factor width.
1294 VectorizationCostTy expectedCost(unsigned VF);
1296 /// Returns the execution time cost of an instruction for a given vector
1297 /// width. Vector width of one means scalar.
1298 VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1300 /// The cost-computation logic from getInstructionCost which provides
1301 /// the vector type as an output parameter.
1302 unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1304 /// Calculate vectorization cost of memory instruction \p I.
1305 unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1307 /// The cost computation for scalarized memory instruction.
1308 unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1310 /// The cost computation for interleaving group of memory instructions.
1311 unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1313 /// The cost computation for Gather/Scatter instruction.
1314 unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1316 /// The cost computation for widening instruction \p I with consecutive
1317 /// memory access.
1318 unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1320 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1321 /// Load: scalar load + broadcast.
1322 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1323 /// element)
1324 unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1326 /// Estimate the overhead of scalarizing an instruction. This is a
1327 /// convenience wrapper for the type-based getScalarizationOverhead API.
1328 unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1330 /// Returns whether the instruction is a load or store and will be emitted
1331 /// as a vector operation.
1332 bool isConsecutiveLoadOrStore(Instruction *I);
1334 /// Returns true if an artificially high cost for emulated masked memrefs
1335 /// should be used.
1336 bool useEmulatedMaskMemRefHack(Instruction *I);
1338 /// Map of scalar integer values to the smallest bitwidth they can be legally
1339 /// represented as. The vector equivalents of these values should be truncated
1340 /// to this type.
1341 MapVector<Instruction *, uint64_t> MinBWs;
1343 /// A type representing the costs for instructions if they were to be
1344 /// scalarized rather than vectorized. The entries are Instruction-Cost
1345 /// pairs.
1346 using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1348 /// A set containing all BasicBlocks that are known to be present after
1349 /// vectorization as predicated blocks.
1350 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1352 /// Records whether it is allowed to have the original scalar loop execute at
1353 /// least once. This may be needed as a fallback loop in case runtime
1354 /// aliasing/dependence checks fail, or to handle the tail/remainder
1355 /// iterations when the trip count is unknown or is not a multiple of the VF,
1356 /// or as a peel-loop to handle gaps in interleave-groups.
1357 /// Under optsize and when the trip count is very small we don't allow any
1358 /// iterations to execute in the scalar loop.
1359 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1361 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1362 bool FoldTailByMasking = false;
1364 /// A map holding scalar costs for different vectorization factors. The
1365 /// presence of a cost for an instruction in the mapping indicates that the
1366 /// instruction will be scalarized when vectorizing with the associated
1367 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368 DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1370 /// Holds the instructions known to be uniform after vectorization.
1371 /// The data is collected per VF.
1372 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1374 /// Holds the instructions known to be scalar after vectorization.
1375 /// The data is collected per VF.
1376 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1378 /// Holds the instructions (address computations) that are forced to be
1379 /// scalarized.
1380 DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1382 /// Returns the expected difference in cost from scalarizing the expression
1383 /// feeding a predicated instruction \p PredInst. The instructions to
1384 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385 /// non-negative return value implies the expression will be scalarized.
1386 /// Currently, only single-use chains are considered for scalarization.
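/// For example (illustrative), a non-negative discount for a chain of
/// address computations feeding a predicated store means that emitting the
/// chain as VF scalar instructions is expected to be no more expensive than
/// widening it and extracting the lanes the scalarized store needs.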
1387 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388 unsigned VF);
1390 /// Collect the instructions that are uniform after vectorization. An
1391 /// instruction is uniform if we represent it with a single scalar value in
1392 /// the vectorized loop corresponding to each vector iteration. Examples of
1393 /// uniform instructions include pointer operands of consecutive or
1394 /// interleaved memory accesses. Note that although uniformity implies an
1395 /// instruction will be scalar, the reverse is not true. In general, a
1396 /// scalarized instruction will be represented by VF scalar values in the
1397 /// vectorized loop, each corresponding to an iteration of the original
1398 /// scalar loop.
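/// For example, the pointer operand of a consecutive load is uniform: a
/// single scalar address per unroll part serves the whole wide load, whereas
/// a non-uniform scalarized instruction is replicated VF times per part.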
1399 void collectLoopUniforms(unsigned VF);
1401 /// Collect the instructions that are scalar after vectorization. An
1402 /// instruction is scalar if it is known to be uniform or will be scalarized
1403 /// during vectorization. Non-uniform scalarized instructions will be
1404 /// represented by VF values in the vectorized loop, each corresponding to an
1405 /// iteration of the original scalar loop.
1406 void collectLoopScalars(unsigned VF);
1408 /// Keeps cost model vectorization decision and cost for instructions.
1409 /// Right now it is used for memory instructions only.
1410 using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1411 std::pair<InstWidening, unsigned>>;
1413 DecisionList WideningDecisions;
1415 /// Returns true if \p V is expected to be vectorized and it needs to be
1416 /// extracted.
1417 bool needsExtract(Value *V, unsigned VF) const {
1418 Instruction *I = dyn_cast<Instruction>(V);
1419 if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1420 return false;
1422 // Assume we can vectorize V (and hence we need extraction) if the
1423 // scalars are not computed yet. This can happen, because it is called
1424 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1425 // the scalars are collected. That should be a safe assumption in most
1426 // cases, because we check if the operands have vectorizable types
1427 // beforehand in LoopVectorizationLegality.
1428 return Scalars.find(VF) == Scalars.end() ||
1429 !isScalarAfterVectorization(I, VF);
1432 /// Returns a range containing only operands needing to be extracted.
1433 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1434 unsigned VF) {
1435 return SmallVector<Value *, 4>(make_filter_range(
1436 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1439 public:
1440 /// The loop that we evaluate.
1441 Loop *TheLoop;
1443 /// Predicated scalar evolution analysis.
1444 PredicatedScalarEvolution &PSE;
1446 /// Loop Info analysis.
1447 LoopInfo *LI;
1449 /// Vectorization legality.
1450 LoopVectorizationLegality *Legal;
1452 /// Vector target information.
1453 const TargetTransformInfo &TTI;
1455 /// Target Library Info.
1456 const TargetLibraryInfo *TLI;
1458 /// Demanded bits analysis.
1459 DemandedBits *DB;
1461 /// Assumption cache.
1462 AssumptionCache *AC;
1464 /// Interface to emit optimization remarks.
1465 OptimizationRemarkEmitter *ORE;
1467 const Function *TheFunction;
1469 /// Loop Vectorize Hint.
1470 const LoopVectorizeHints *Hints;
1472 /// The interleave access information contains groups of interleaved accesses
1473 /// with the same stride and close to each other.
1474 InterleavedAccessInfo &InterleaveInfo;
1476 /// Values to ignore in the cost model.
1477 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1479 /// Values to ignore in the cost model when VF > 1.
1480 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1483 } // end namespace llvm
1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1486 // vectorization. The loop needs to be annotated with #pragma omp simd
1487 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1488 // vector length information is not provided, vectorization is not considered
1489 // explicit. Interleave hints are not allowed either. These limitations will be
1490 // relaxed in the future.
1491 // Please note that we are currently forced to abuse the pragma 'clang
1492 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1494 // provides *explicit vectorization hints* (LV can bypass legal checks and
1495 // assume that vectorization is legal). However, both hints are implemented
1496 // using the same metadata (llvm.loop.vectorize, processed by
1497 // LoopVectorizeHints). This will be fixed in the future when the native IR
1498 // representation for pragma 'omp simd' is introduced.
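// For example (illustrative only), an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// over an inner loop qualifies, whereas the same loop without an explicit
// vectorize_width does not.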
1499 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1500 OptimizationRemarkEmitter *ORE) {
1501 assert(!OuterLp->empty() && "This is not an outer loop");
1502 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1504 // Only outer loops with an explicit vectorization hint are supported.
1505 // Unannotated outer loops are ignored.
1506 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1507 return false;
1509 Function *Fn = OuterLp->getHeader()->getParent();
1510 if (!Hints.allowVectorization(Fn, OuterLp,
1511 true /*VectorizeOnlyWhenForced*/)) {
1512 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1513 return false;
1516 if (Hints.getInterleave() > 1) {
1517 // TODO: Interleave support is future work.
1518 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1519 "outer loops.\n");
1520 Hints.emitRemarkWithHints();
1521 return false;
1524 return true;
1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1528 OptimizationRemarkEmitter *ORE,
1529 SmallVectorImpl<Loop *> &V) {
1530 // Collect inner loops and outer loops without irreducible control flow. For
1531 // now, only collect outer loops that have explicit vectorization hints. If we
1532 // are stress testing the VPlan H-CFG construction, we collect the outermost
1533 // loop of every loop nest.
1534 if (L.empty() || VPlanBuildStressTest ||
1535 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1536 LoopBlocksRPO RPOT(&L);
1537 RPOT.perform(LI);
1538 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1539 V.push_back(&L);
1540 // TODO: Collect inner loops inside marked outer loops in case
1541 // vectorization fails for the outer loop. Do not invoke
1542 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1543 // already known to be reducible. We can use an inherited attribute for
1544 // that.
1545 return;
1548 for (Loop *InnerL : L)
1549 collectSupportedLoops(*InnerL, LI, ORE, V);
1552 namespace {
1554 /// The LoopVectorize Pass.
1555 struct LoopVectorize : public FunctionPass {
1556 /// Pass identification, replacement for typeid
1557 static char ID;
1559 LoopVectorizePass Impl;
1561 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1562 bool VectorizeOnlyWhenForced = false)
1563 : FunctionPass(ID) {
1564 Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1565 Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1566 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1569 bool runOnFunction(Function &F) override {
1570 if (skipFunction(F))
1571 return false;
1573 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1574 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1575 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1576 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1577 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1578 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1579 auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1580 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1581 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1582 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1583 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1584 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1585 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1587 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1588 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1590 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1591 GetLAA, *ORE, PSI);
1594 void getAnalysisUsage(AnalysisUsage &AU) const override {
1595 AU.addRequired<AssumptionCacheTracker>();
1596 AU.addRequired<BlockFrequencyInfoWrapperPass>();
1597 AU.addRequired<DominatorTreeWrapperPass>();
1598 AU.addRequired<LoopInfoWrapperPass>();
1599 AU.addRequired<ScalarEvolutionWrapperPass>();
1600 AU.addRequired<TargetTransformInfoWrapperPass>();
1601 AU.addRequired<AAResultsWrapperPass>();
1602 AU.addRequired<LoopAccessLegacyAnalysis>();
1603 AU.addRequired<DemandedBitsWrapperPass>();
1604 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1606 // We currently do not preserve loopinfo/dominator analyses with outer loop
1607 // vectorization. Until this is addressed, mark these analyses as preserved
1608 // only for non-VPlan-native path.
1609 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1610 if (!EnableVPlanNativePath) {
1611 AU.addPreserved<LoopInfoWrapperPass>();
1612 AU.addPreserved<DominatorTreeWrapperPass>();
1615 AU.addPreserved<BasicAAWrapperPass>();
1616 AU.addPreserved<GlobalsAAWrapperPass>();
1617 AU.addRequired<ProfileSummaryInfoWrapperPass>();
1621 } // end anonymous namespace
1623 //===----------------------------------------------------------------------===//
1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1625 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1626 //===----------------------------------------------------------------------===//
1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1629 // We need to place the broadcast of invariant variables outside the loop,
1630 // but only if it's proven safe to do so. Otherwise, the broadcast will be
1631 // inside the vector loop body.
1632 Instruction *Instr = dyn_cast<Instruction>(V);
1633 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1634 (!Instr ||
1635 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1636 // Place the code for broadcasting invariant variables in the new preheader.
1637 IRBuilder<>::InsertPointGuard Guard(Builder);
1638 if (SafeToHoist)
1639 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1641 // Broadcast the scalar into all locations in the vector.
1642 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1644 return Shuf;
1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1648 const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1649 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1650 "Expected either an induction phi-node or a truncate of it!");
1651 Value *Start = II.getStartValue();
1653 // Construct the initial value of the vector IV in the vector loop preheader
1654 auto CurrIP = Builder.saveIP();
1655 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1656 if (isa<TruncInst>(EntryVal)) {
1657 assert(Start->getType()->isIntegerTy() &&
1658 "Truncation requires an integer type");
1659 auto *TruncType = cast<IntegerType>(EntryVal->getType());
1660 Step = Builder.CreateTrunc(Step, TruncType);
1661 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1663 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1664 Value *SteppedStart =
1665 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1667 // We create vector phi nodes for both integer and floating-point induction
1668 // variables. Here, we determine the kind of arithmetic we will perform.
1669 Instruction::BinaryOps AddOp;
1670 Instruction::BinaryOps MulOp;
1671 if (Step->getType()->isIntegerTy()) {
1672 AddOp = Instruction::Add;
1673 MulOp = Instruction::Mul;
1674 } else {
1675 AddOp = II.getInductionOpcode();
1676 MulOp = Instruction::FMul;
1679 // Multiply the vectorization factor by the step using integer or
1680 // floating-point arithmetic as appropriate.
1681 Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1682 Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1684 // Create a vector splat to use in the induction update.
1686 // FIXME: If the step is non-constant, we create the vector splat with
1687 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1688 // handle a constant vector splat.
1689 Value *SplatVF = isa<Constant>(Mul)
1690 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1691 : Builder.CreateVectorSplat(VF, Mul);
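  // For example, with VF = 4 and an integer step of 2, Mul is 8 and SplatVF is
  // <8, 8, 8, 8>, so each unroll part below advances the vector IV by VF * Step.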
1692 Builder.restoreIP(CurrIP);
1694 // We may need to add the step a number of times, depending on the unroll
1695 // factor. The last of those goes into the PHI.
1696 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1697 &*LoopVectorBody->getFirstInsertionPt());
1698 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1699 Instruction *LastInduction = VecInd;
1700 for (unsigned Part = 0; Part < UF; ++Part) {
1701 VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1703 if (isa<TruncInst>(EntryVal))
1704 addMetadata(LastInduction, EntryVal);
1705 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1707 LastInduction = cast<Instruction>(addFastMathFlag(
1708 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1709 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1712 // Move the last step to the end of the latch block. This ensures consistent
1713 // placement of all induction updates.
1714 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1715 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1716 auto *ICmp = cast<Instruction>(Br->getCondition());
1717 LastInduction->moveBefore(ICmp);
1718 LastInduction->setName("vec.ind.next");
1720 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1721 VecInd->addIncoming(LastInduction, LoopVectorLatch);
1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1725 return Cost->isScalarAfterVectorization(I, VF) ||
1726 Cost->isProfitableToScalarize(I, VF);
1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1730 if (shouldScalarizeInstruction(IV))
1731 return true;
1732 auto isScalarInst = [&](User *U) -> bool {
1733 auto *I = cast<Instruction>(U);
1734 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1736 return llvm::any_of(IV->users(), isScalarInst);
1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1740 const InductionDescriptor &ID, const Instruction *EntryVal,
1741 Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1742 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1743 "Expected either an induction phi-node or a truncate of it!");
1745 // This induction variable is not the phi from the original loop but the
1746 // newly-created IV based on the proof that the casted phi is equal to the
1747 // uncasted phi in the vectorized loop (possibly under a runtime guard). It
1748 // reuses the same InductionDescriptor as the original IV, but we don't
1749 // have to do any recording in this case - that is done when the original
1750 // IV is processed.
1751 if (isa<TruncInst>(EntryVal))
1752 return;
1754 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1755 if (Casts.empty())
1756 return;
1757 // Only the first Cast instruction in the Casts vector is of interest.
1758 // The rest of the Casts (if any) have no uses outside the
1759 // induction update chain itself.
1760 Instruction *CastInst = *Casts.begin();
1761 if (Lane < UINT_MAX)
1762 VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1763 else
1764 VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1768 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1769 "Primary induction variable must have an integer type");
1771 auto II = Legal->getInductionVars()->find(IV);
1772 assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1774 auto ID = II->second;
1775 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1777 // The scalar value to broadcast. This will be derived from the canonical
1778 // induction variable.
1779 Value *ScalarIV = nullptr;
1781 // The value from the original loop to which we are mapping the new induction
1782 // variable.
1783 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1785 // True if we have vectorized the induction variable.
1786 auto VectorizedIV = false;
1788 // Determine if we want a scalar version of the induction variable. This is
1789 // true if the induction variable itself is not widened, or if it has at
1790 // least one user in the loop that is not widened.
1791 auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1793 // Generate code for the induction step. Note that induction steps are
1794 // required to be loop-invariant.
1795 assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1796 "Induction step should be loop invariant");
1797 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1798 Value *Step = nullptr;
1799 if (PSE.getSE()->isSCEVable(IV->getType())) {
1800 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1801 Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1802 LoopVectorPreHeader->getTerminator());
1803 } else {
1804 Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1807 // Try to create a new independent vector induction variable. If we can't
1808 // create the phi node, we will splat the scalar induction variable in each
1809 // loop iteration.
1810 if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1811 createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1812 VectorizedIV = true;
1815 // If we haven't yet vectorized the induction variable, or if we will create
1816 // a scalar one, we need to define the scalar induction variable and step
1817 // values. If we were given a truncation type, truncate the canonical
1818 // induction variable and step. Otherwise, derive these values from the
1819 // induction descriptor.
1820 if (!VectorizedIV || NeedsScalarIV) {
1821 ScalarIV = Induction;
1822 if (IV != OldInduction) {
1823 ScalarIV = IV->getType()->isIntegerTy()
1824 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1825 : Builder.CreateCast(Instruction::SIToFP, Induction,
1826 IV->getType());
1827 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1828 ScalarIV->setName("offset.idx");
1830 if (Trunc) {
1831 auto *TruncType = cast<IntegerType>(Trunc->getType());
1832 assert(Step->getType()->isIntegerTy() &&
1833 "Truncation requires an integer step");
1834 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1835 Step = Builder.CreateTrunc(Step, TruncType);
1839 // If we haven't yet vectorized the induction variable, splat the scalar
1840 // induction variable, and build the necessary step vectors.
1841 // TODO: Don't do it unless the vectorized IV is really required.
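  // For example, for an integer IV with UF = 2 and VF = 4, this produces the
  // two vector values ScalarIV + <0,1,2,3> * Step and ScalarIV + <4,5,6,7> * Step.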
1842 if (!VectorizedIV) {
1843 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1844 for (unsigned Part = 0; Part < UF; ++Part) {
1845 Value *EntryPart =
1846 getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1847 VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1848 if (Trunc)
1849 addMetadata(EntryPart, Trunc);
1850 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1854 // If an induction variable is only used for counting loop iterations or
1855 // calculating addresses, it doesn't need to be widened. Create scalar steps
1856 // that can be used by instructions we will later scalarize. Note that the
1857 // addition of the scalar steps will not increase the number of instructions
1858 // in the loop in the common case prior to InstCombine. We will be trading
1859 // one vector extract for each scalar step.
1860 if (NeedsScalarIV)
1861 buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1865 Instruction::BinaryOps BinOp) {
1866 // Create and check the types.
1867 assert(Val->getType()->isVectorTy() && "Must be a vector");
1868 int VLen = Val->getType()->getVectorNumElements();
1870 Type *STy = Val->getType()->getScalarType();
1871 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1872 "Induction Step must be an integer or FP");
1873 assert(Step->getType() == STy && "Step has wrong type");
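  // The result is Val + <StartIdx, StartIdx + 1, ...> * Step. For example, with
  // VF = 4, StartIdx = 0 and Step = 2, this produces Val + <0, 2, 4, 6>.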
1875 SmallVector<Constant *, 8> Indices;
1877 if (STy->isIntegerTy()) {
1878 // Create a vector of consecutive integers starting at StartIdx.
1879 for (int i = 0; i < VLen; ++i)
1880 Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1882 // Add the consecutive indices to the vector value.
1883 Constant *Cv = ConstantVector::get(Indices);
1884 assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1885 Step = Builder.CreateVectorSplat(VLen, Step);
1886 assert(Step->getType() == Val->getType() && "Invalid step vec");
1887 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1888 // which can be found from the original scalar operations.
1889 Step = Builder.CreateMul(Cv, Step);
1890 return Builder.CreateAdd(Val, Step, "induction");
1893 // Floating point induction.
1894 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1895 "Binary Opcode should be specified for FP induction");
1896 // Create a vector of consecutive FP values starting at StartIdx.
1897 for (int i = 0; i < VLen; ++i)
1898 Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1900 // Add the consecutive indices to the vector value.
1901 Constant *Cv = ConstantVector::get(Indices);
1903 Step = Builder.CreateVectorSplat(VLen, Step);
1905 // Floating point operations had to be 'fast' to enable the induction.
1906 FastMathFlags Flags;
1907 Flags.setFast();
1909 Value *MulOp = Builder.CreateFMul(Cv, Step);
1910 if (isa<Instruction>(MulOp))
1911 // Have to check, MulOp may be a constant
1912 cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1914 Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1915 if (isa<Instruction>(BOp))
1916 cast<Instruction>(BOp)->setFastMathFlags(Flags);
1917 return BOp;
1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1921 Instruction *EntryVal,
1922 const InductionDescriptor &ID) {
1923 // We shouldn't have to build scalar steps if we aren't vectorizing.
1924 assert(VF > 1 && "VF should be greater than one");
1926 // Get the value type and ensure it and the step have the same integer type.
1927 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1928 assert(ScalarIVTy == Step->getType() &&
1929 "Val and Step should have the same type");
1931 // We build scalar steps for both integer and floating-point induction
1932 // variables. Here, we determine the kind of arithmetic we will perform.
1933 Instruction::BinaryOps AddOp;
1934 Instruction::BinaryOps MulOp;
1935 if (ScalarIVTy->isIntegerTy()) {
1936 AddOp = Instruction::Add;
1937 MulOp = Instruction::Mul;
1938 } else {
1939 AddOp = ID.getInductionOpcode();
1940 MulOp = Instruction::FMul;
1943 // Determine the number of scalars we need to generate for each unroll
1944 // iteration. If EntryVal is uniform, we only need to generate the first
1945 // lane. Otherwise, we generate all VF values.
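  // For example, with UF = 2, VF = 4 and a non-uniform EntryVal, the scalar
  // steps are ScalarIV + {0, 1, ..., 7} * Step, spread across the two unroll parts.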
1946 unsigned Lanes =
1947 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1948 : VF;
1949 // Compute the scalar steps and save the results in VectorLoopValueMap.
1950 for (unsigned Part = 0; Part < UF; ++Part) {
1951 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1952 auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1953 auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1954 auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1955 VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1956 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1962 assert(V != Induction && "The new induction variable should not be used.");
1963 assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1964 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1966 // If we have a stride that is replaced by one, do it here. Defer this for
1967 // the VPlan-native path until we start running Legal checks in that path.
1968 if (!EnableVPlanNativePath && Legal->hasStride(V))
1969 V = ConstantInt::get(V->getType(), 1);
1971 // If we have a vector mapped to this value, return it.
1972 if (VectorLoopValueMap.hasVectorValue(V, Part))
1973 return VectorLoopValueMap.getVectorValue(V, Part);
1975 // If the value has not been vectorized, check if it has been scalarized
1976 // instead. If it has been scalarized, and we actually need the value in
1977 // vector form, we will construct the vector values on demand.
1978 if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1979 Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1981 // If we've scalarized a value, that value should be an instruction.
1982 auto *I = cast<Instruction>(V);
1984 // If we aren't vectorizing, we can just copy the scalar map values over to
1985 // the vector map.
1986 if (VF == 1) {
1987 VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1988 return ScalarValue;
1991 // Get the last scalar instruction we generated for V and Part. If the value
1992 // is known to be uniform after vectorization, this corresponds to lane zero
1993 // of the Part unroll iteration. Otherwise, the last instruction is the one
1994 // we created for the last vector lane of the Part unroll iteration.
1995 unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1996 auto *LastInst = cast<Instruction>(
1997 VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1999 // Set the insert point after the last scalarized instruction. This ensures
2000 // the insertelement sequence will directly follow the scalar definitions.
2001 auto OldIP = Builder.saveIP();
2002 auto NewIP = std::next(BasicBlock::iterator(LastInst));
2003 Builder.SetInsertPoint(&*NewIP);
2005 // However, if we are vectorizing, we need to construct the vector values.
2006 // If the value is known to be uniform after vectorization, we can just
2007 // broadcast the scalar value corresponding to lane zero for each unroll
2008 // iteration. Otherwise, we construct the vector values using insertelement
2009 // instructions. Since the resulting vectors are stored in
2010 // VectorLoopValueMap, we will only generate the insertelements once.
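  // For example, with VF = 4 a non-uniform scalarized value is packed into a
  // vector by four insertelement instructions starting from an undef vector.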
2011 Value *VectorValue = nullptr;
2012 if (Cost->isUniformAfterVectorization(I, VF)) {
2013 VectorValue = getBroadcastInstrs(ScalarValue);
2014 VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2015 } else {
2016 // Initialize packing with insertelements to start from undef.
2017 Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2018 VectorLoopValueMap.setVectorValue(V, Part, Undef);
2019 for (unsigned Lane = 0; Lane < VF; ++Lane)
2020 packScalarIntoVectorValue(V, {Part, Lane});
2021 VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2023 Builder.restoreIP(OldIP);
2024 return VectorValue;
2027 // If this scalar is unknown, assume that it is a constant or that it is
2028 // loop invariant. Broadcast V and save the value for future uses.
2029 Value *B = getBroadcastInstrs(V);
2030 VectorLoopValueMap.setVectorValue(V, Part, B);
2031 return B;
2034 Value *
2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2036 const VPIteration &Instance) {
2037 // If the value is not an instruction contained in the loop, it should
2038 // already be scalar.
2039 if (OrigLoop->isLoopInvariant(V))
2040 return V;
2042 assert(Instance.Lane > 0
2043 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2044 : true && "Uniform values only have lane zero");
2046 // If the value from the original loop has not been vectorized, it is
2047 // represented by UF x VF scalar values in the new loop. Return the requested
2048 // scalar value.
2049 if (VectorLoopValueMap.hasScalarValue(V, Instance))
2050 return VectorLoopValueMap.getScalarValue(V, Instance);
2052 // If the value has not been scalarized, get its entry in VectorLoopValueMap
2053 // for the given unroll part. If this entry is not a vector type (i.e., the
2054 // vectorization factor is one), there is no need to generate an
2055 // extractelement instruction.
2056 auto *U = getOrCreateVectorValue(V, Instance.Part);
2057 if (!U->getType()->isVectorTy()) {
2058 assert(VF == 1 && "Value not scalarized has non-vector type");
2059 return U;
2062 // Otherwise, the value from the original loop has been vectorized and is
2063 // represented by UF vector values. Extract and return the requested scalar
2064 // value from the appropriate vector lane.
2065 return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2068 void InnerLoopVectorizer::packScalarIntoVectorValue(
2069 Value *V, const VPIteration &Instance) {
2070 assert(V != Induction && "The new induction variable should not be used.");
2071 assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2072 assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2074 Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2075 Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2076 VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2077 Builder.getInt32(Instance.Lane));
2078 VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2082 assert(Vec->getType()->isVectorTy() && "Invalid type");
2083 SmallVector<Constant *, 8> ShuffleMask;
2084 for (unsigned i = 0; i < VF; ++i)
2085 ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2087 return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2088 ConstantVector::get(ShuffleMask),
2089 "reverse");
2092 // Return whether we allow using masked interleave-groups (for dealing with
2093 // strided loads/stores that reside in predicated blocks, or for dealing
2094 // with gaps).
2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2096 // If an override option has been passed in for interleaved accesses, use it.
2097 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2098 return EnableMaskedInterleavedMemAccesses;
2100 return TTI.enableMaskedInterleavedAccessVectorization();
2103 // Try to vectorize the interleave group that \p Instr belongs to.
2105 // E.g. Translate the following interleaved load group (factor = 3):
2106 // for (i = 0; i < N; i+=3) {
2107 // R = Pic[i]; // Member of index 0
2108 // G = Pic[i+1]; // Member of index 1
2109 // B = Pic[i+2]; // Member of index 2
2110 // ... // do something to R, G, B
2111 // }
2112 // To:
2113 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2114 // %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements
2115 // %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements
2116 // %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements
2118 // Or translate the following interleaved store group (factor = 3):
2119 // for (i = 0; i < N; i+=3) {
2120 // ... do something to R, G, B
2121 // Pic[i] = R; // Member of index 0
2122 // Pic[i+1] = G; // Member of index 1
2123 // Pic[i+2] = B; // Member of index 2
2124 // }
2125 // To:
2126 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2127 // %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2128 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2129 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2130 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2132 VectorParts *BlockInMask) {
2133 const InterleaveGroup<Instruction> *Group =
2134 Cost->getInterleavedAccessGroup(Instr);
2135 assert(Group && "Fail to get an interleaved access group.");
2137 // Skip if current instruction is not the insert position.
2138 if (Instr != Group->getInsertPos())
2139 return;
2141 const DataLayout &DL = Instr->getModule()->getDataLayout();
2142 Value *Ptr = getLoadStorePointerOperand(Instr);
2144 // Prepare for the vector type of the interleaved load/store.
2145 Type *ScalarTy = getMemInstValueType(Instr);
2146 unsigned InterleaveFactor = Group->getFactor();
2147 Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2148 Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2150 // Prepare for the new pointers.
2151 setDebugLocFromInst(Builder, Ptr);
2152 SmallVector<Value *, 2> NewPtrs;
2153 unsigned Index = Group->getIndex(Instr);
2155 VectorParts Mask;
2156 bool IsMaskForCondRequired = BlockInMask;
2157 if (IsMaskForCondRequired) {
2158 Mask = *BlockInMask;
2159 // TODO: extend the masked interleaved-group support to reversed access.
2160 assert(!Group->isReverse() && "Reversed masked interleave-group "
2161 "not supported.");
2164 // If the group is reverse, adjust the index to refer to the last vector lane
2165 // instead of the first. We adjust the index from the first vector lane,
2166 // rather than directly getting the pointer for lane VF - 1, because the
2167 // pointer operand of the interleaved access is supposed to be uniform. For
2168 // uniform instructions, we're only required to generate a value for the
2169 // first vector lane in each unroll iteration.
2170 if (Group->isReverse())
2171 Index += (VF - 1) * Group->getFactor();
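  // For example, with VF = 4 and an interleave factor of 3, the index is
  // advanced by 9 members so that the adjusted pointer addresses the tuple
  // accessed by the last vector lane.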
2173 bool InBounds = false;
2174 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2175 InBounds = gep->isInBounds();
2177 for (unsigned Part = 0; Part < UF; Part++) {
2178 Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2180 // Notice that the current instruction could be at any member index. We need
2181 // to adjust the address to the member of index 0.
2183 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2184 // b = A[i]; // Member of index 0
2185 // The current pointer points to A[i+1]; adjust it to A[i].
2187 // E.g. A[i+1] = a; // Member of index 1
2188 // A[i] = b; // Member of index 0
2189 // A[i+2] = c; // Member of index 2 (Current instruction)
2190 // The current pointer points to A[i+2]; adjust it to A[i].
2191 NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2192 if (InBounds)
2193 cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2195 // Cast to the vector pointer type.
2196 NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2199 setDebugLocFromInst(Builder, Instr);
2200 Value *UndefVec = UndefValue::get(VecTy);
2202 Value *MaskForGaps = nullptr;
2203 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2204 MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2205 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2208 // Vectorize the interleaved load group.
2209 if (isa<LoadInst>(Instr)) {
2210 // For each unroll part, create a wide load for the group.
2211 SmallVector<Value *, 2> NewLoads;
2212 for (unsigned Part = 0; Part < UF; Part++) {
2213 Instruction *NewLoad;
2214 if (IsMaskForCondRequired || MaskForGaps) {
2215 assert(useMaskedInterleavedAccesses(*TTI) &&
2216 "masked interleaved groups are not allowed.");
2217 Value *GroupMask = MaskForGaps;
2218 if (IsMaskForCondRequired) {
2219 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2220 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2221 Value *ShuffledMask = Builder.CreateShuffleVector(
2222 Mask[Part], Undefs, RepMask, "interleaved.mask");
2223 GroupMask = MaskForGaps
2224 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2225 MaskForGaps)
2226 : ShuffledMask;
2228 NewLoad =
2229 Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2230 GroupMask, UndefVec, "wide.masked.vec");
2232 else
2233 NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2234 Group->getAlignment(), "wide.vec");
2235 Group->addMetadata(NewLoad);
2236 NewLoads.push_back(NewLoad);
2239 // For each member in the group, shuffle out the appropriate data from the
2240 // wide loads.
2241 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2242 Instruction *Member = Group->getMember(I);
2244 // Skip the gaps in the group.
2245 if (!Member)
2246 continue;
2248 Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2249 for (unsigned Part = 0; Part < UF; Part++) {
2250 Value *StridedVec = Builder.CreateShuffleVector(
2251 NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2253 // If this member has a different type, cast the result to that type.
2254 if (Member->getType() != ScalarTy) {
2255 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2256 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2259 if (Group->isReverse())
2260 StridedVec = reverseVector(StridedVec);
2262 VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2265 return;
2268 // The sub vector type for current instruction.
2269 VectorType *SubVT = VectorType::get(ScalarTy, VF);
2271 // Vectorize the interleaved store group.
2272 for (unsigned Part = 0; Part < UF; Part++) {
2273 // Collect the stored vector from each member.
2274 SmallVector<Value *, 4> StoredVecs;
2275 for (unsigned i = 0; i < InterleaveFactor; i++) {
2276 // An interleaved store group doesn't allow gaps, so each index has a member.
2277 Instruction *Member = Group->getMember(i);
2278 assert(Member && "Fail to get a member from an interleaved store group");
2280 Value *StoredVec = getOrCreateVectorValue(
2281 cast<StoreInst>(Member)->getValueOperand(), Part);
2282 if (Group->isReverse())
2283 StoredVec = reverseVector(StoredVec);
2285 // If this member has a different type, cast it to the unified type.
2287 if (StoredVec->getType() != SubVT)
2288 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2290 StoredVecs.push_back(StoredVec);
2293 // Concatenate all vectors into a wide vector.
2294 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2296 // Interleave the elements in the wide vector.
2297 Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2298 Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2299 "interleaved.vec");
2301 Instruction *NewStoreInstr;
2302 if (IsMaskForCondRequired) {
2303 auto *Undefs = UndefValue::get(Mask[Part]->getType());
2304 auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2305 Value *ShuffledMask = Builder.CreateShuffleVector(
2306 Mask[Part], Undefs, RepMask, "interleaved.mask");
2307 NewStoreInstr = Builder.CreateMaskedStore(
2308 IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2310 else
2311 NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2312 Group->getAlignment());
2314 Group->addMetadata(NewStoreInstr);
2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2319 VectorParts *BlockInMask) {
2320 // Attempt to issue a wide load.
2321 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2322 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2324 assert((LI || SI) && "Invalid Load/Store instruction");
2326 LoopVectorizationCostModel::InstWidening Decision =
2327 Cost->getWideningDecision(Instr, VF);
2328 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2329 "CM decision should be taken at this point");
2330 if (Decision == LoopVectorizationCostModel::CM_Interleave)
2331 return vectorizeInterleaveGroup(Instr);
2333 Type *ScalarDataTy = getMemInstValueType(Instr);
2334 Type *DataTy = VectorType::get(ScalarDataTy, VF);
2335 Value *Ptr = getLoadStorePointerOperand(Instr);
2336 unsigned Alignment = getLoadStoreAlignment(Instr);
2337 // An alignment of 0 means target ABI alignment. We need to use the scalar's
2338 // target ABI alignment in such a case.
2339 const DataLayout &DL = Instr->getModule()->getDataLayout();
2340 if (!Alignment)
2341 Alignment = DL.getABITypeAlignment(ScalarDataTy);
2342 unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2344 // Determine if the pointer operand of the access is either consecutive or
2345 // reverse consecutive.
2346 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2347 bool ConsecutiveStride =
2348 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2349 bool CreateGatherScatter =
2350 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2352 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2353 // gather/scatter. Otherwise Decision should have been to Scalarize.
2354 assert((ConsecutiveStride || CreateGatherScatter) &&
2355 "The instruction should be scalarized");
2357 // Handle consecutive loads/stores.
2358 if (ConsecutiveStride)
2359 Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2361 VectorParts Mask;
2362 bool isMaskRequired = BlockInMask;
2363 if (isMaskRequired)
2364 Mask = *BlockInMask;
2366 bool InBounds = false;
2367 if (auto *gep = dyn_cast<GetElementPtrInst>(
2368 getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2369 InBounds = gep->isInBounds();
2371 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2372 // Calculate the pointer for the specific unroll-part.
2373 GetElementPtrInst *PartPtr = nullptr;
2375 if (Reverse) {
2376 // If the address is consecutive but reversed, then the
2377 // wide store needs to start at the last vector element.
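      // For example, with VF = 4 and Part = 0 the two GEPs below produce
      // Ptr[-3], so the wide access covers Ptr[-3..0] and is consumed in
      // reversed lane order.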
2378 PartPtr = cast<GetElementPtrInst>(
2379 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2380 PartPtr->setIsInBounds(InBounds);
2381 PartPtr = cast<GetElementPtrInst>(
2382 Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2383 PartPtr->setIsInBounds(InBounds);
2384 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2385 Mask[Part] = reverseVector(Mask[Part]);
2386 } else {
2387 PartPtr = cast<GetElementPtrInst>(
2388 Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2389 PartPtr->setIsInBounds(InBounds);
2392 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2395 // Handle Stores:
2396 if (SI) {
2397 setDebugLocFromInst(Builder, SI);
2399 for (unsigned Part = 0; Part < UF; ++Part) {
2400 Instruction *NewSI = nullptr;
2401 Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2402 if (CreateGatherScatter) {
2403 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2404 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2405 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2406 MaskPart);
2407 } else {
2408 if (Reverse) {
2409 // If we store to reverse consecutive memory locations, then we need
2410 // to reverse the order of elements in the stored value.
2411 StoredVal = reverseVector(StoredVal);
2412 // We don't want to update the value in the map as it might be used in
2413 // another expression. So don't call resetVectorValue(StoredVal).
2415 auto *VecPtr = CreateVecPtr(Part, Ptr);
2416 if (isMaskRequired)
2417 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2418 Mask[Part]);
2419 else
2420 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2422 addMetadata(NewSI, SI);
2424 return;
2427 // Handle loads.
2428 assert(LI && "Must have a load instruction");
2429 setDebugLocFromInst(Builder, LI);
2430 for (unsigned Part = 0; Part < UF; ++Part) {
2431 Value *NewLI;
2432 if (CreateGatherScatter) {
2433 Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2434 Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2435 NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2436 nullptr, "wide.masked.gather");
2437 addMetadata(NewLI, LI);
2438 } else {
2439 auto *VecPtr = CreateVecPtr(Part, Ptr);
2440 if (isMaskRequired)
2441 NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2442 UndefValue::get(DataTy),
2443 "wide.masked.load");
2444 else
2445 NewLI =
2446 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2448 // Add metadata to the load, but setVectorValue to the reverse shuffle.
2449 addMetadata(NewLI, LI);
2450 if (Reverse)
2451 NewLI = reverseVector(NewLI);
2453 VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2458 const VPIteration &Instance,
2459 bool IfPredicateInstr) {
2460 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2462 setDebugLocFromInst(Builder, Instr);
2464 // Does this instruction return a value?
2465 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2467 Instruction *Cloned = Instr->clone();
2468 if (!IsVoidRetTy)
2469 Cloned->setName(Instr->getName() + ".cloned");
2471 // Replace the operands of the cloned instructions with their scalar
2472 // equivalents in the new loop.
2473 for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2474 auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2475 Cloned->setOperand(op, NewOp);
2477 addNewMetadata(Cloned, Instr);
2479 // Place the cloned scalar in the new loop.
2480 Builder.Insert(Cloned);
2482 // Add the cloned scalar to the scalar map entry.
2483 VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2485 // If we just cloned a new assumption, add it to the assumption cache.
2486 if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2487 if (II->getIntrinsicID() == Intrinsic::assume)
2488 AC->registerAssumption(II);
2490 // End if-block.
2491 if (IfPredicateInstr)
2492 PredicatedInstructions.push_back(Cloned);
2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2496 Value *End, Value *Step,
2497 Instruction *DL) {
2498 BasicBlock *Header = L->getHeader();
2499 BasicBlock *Latch = L->getLoopLatch();
2500 // As we're just creating this loop, it's possible no latch exists
2501 // yet. If so, use the header as this will be a single block loop.
2502 if (!Latch)
2503 Latch = Header;
2505 IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2506 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2507 setDebugLocFromInst(Builder, OldInst);
2508 auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2510 Builder.SetInsertPoint(Latch->getTerminator());
2511 setDebugLocFromInst(Builder, OldInst);
2513 // Create i+1 and fill the PHINode.
2514 Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2515 Induction->addIncoming(Start, L->getLoopPreheader());
2516 Induction->addIncoming(Next, Latch);
2517 // Create the compare.
2518 Value *ICmp = Builder.CreateICmpEQ(Next, End);
2519 Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2521 // Now we have two terminators. Remove the old one from the block.
2522 Latch->getTerminator()->eraseFromParent();
2524 return Induction;
2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2528 if (TripCount)
2529 return TripCount;
2531 assert(L && "Create Trip Count for null loop.");
2532 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2533 // Find the loop boundaries.
2534 ScalarEvolution *SE = PSE.getSE();
2535 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2536 assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2537 "Invalid loop count");
2539 Type *IdxTy = Legal->getWidestInductionType();
2540 assert(IdxTy && "No type for induction");
2542 // The exit count might have type i64 while the phi is i32. This can
2543 // happen if we have an induction variable that is sign extended before the
2544 // compare. The only way we get a backedge-taken count is if the induction
2545 // variable was signed and, as such, will not overflow. In such a case
2546 // truncation is legal.
2547 if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2548 IdxTy->getPrimitiveSizeInBits())
2549 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2550 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2552 // Get the total trip count from the count by adding 1.
2553 const SCEV *ExitCount = SE->getAddExpr(
2554 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2556 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2558 // Expand the trip count and place the new instructions in the preheader.
2559 // Notice that the pre-header does not change, only the loop body.
2560 SCEVExpander Exp(*SE, DL, "induction");
2562 // Count holds the overall loop count (N).
2563 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2564 L->getLoopPreheader()->getTerminator());
2566 if (TripCount->getType()->isPointerTy())
2567 TripCount =
2568 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2569 L->getLoopPreheader()->getTerminator());
2571 return TripCount;
2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2575 if (VectorTripCount)
2576 return VectorTripCount;
2578 Value *TC = getOrCreateTripCount(L);
2579 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2581 Type *Ty = TC->getType();
2582 Constant *Step = ConstantInt::get(Ty, VF * UF);
2584 // If the tail is to be folded by masking, round the number of iterations N
2585 // up to a multiple of Step instead of rounding down. This is done by first
2586 // adding Step-1 and then rounding down. Note that it's ok if this addition
2587 // overflows: the vector induction variable will eventually wrap to zero given
2588 // that it starts at zero and its Step is a power of two; the loop will then
2589 // exit, with the last early-exit vector comparison also producing all-true.
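  // For example, with VF * UF = 8 and an original trip count of 10, N becomes
  // 17 here and the vector trip count computed below is 16, so the masked
  // vector loop covers all 10 scalar iterations.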
2590 if (Cost->foldTailByMasking()) {
2591 assert(isPowerOf2_32(VF * UF) &&
2592 "VF*UF must be a power of 2 when folding tail by masking");
2593 TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2596 // Now we need to generate the expression for the part of the loop that the
2597 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598 // iterations are not required for correctness, or N - Step, otherwise. Step
2599 // is equal to the vectorization factor (number of SIMD elements) times the
2600 // unroll factor (number of SIMD instructions).
2601 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2603 // If there is a non-reversed interleaved group that may speculatively access
2604 // memory out-of-bounds, we need to ensure that there will be at least one
2605 // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606 // the trip count, we set the remainder to be equal to the step. If the step
2607 // does not evenly divide the trip count, no adjustment is necessary since
2608 // there will already be scalar iterations. Note that the minimum iterations
2609 // check ensures that N >= Step.
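  // For example, with VF * UF = 8 and a trip count of 16, the remainder would
  // be 0; it is bumped to 8 so that 8 iterations are left for the scalar
  // epilogue loop.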
2610 if (VF > 1 && Cost->requiresScalarEpilogue()) {
2611 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2612 R = Builder.CreateSelect(IsZero, Step, R);
2615 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2617 return VectorTripCount;
2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2621 const DataLayout &DL) {
2622 // Verify that V is a vector type with same number of elements as DstVTy.
2623 unsigned VF = DstVTy->getNumElements();
2624 VectorType *SrcVecTy = cast<VectorType>(V->getType());
2625 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2626 Type *SrcElemTy = SrcVecTy->getElementType();
2627 Type *DstElemTy = DstVTy->getElementType();
2628 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2629 "Vector elements must have same size");
2631 // Do a direct cast if element types are castable.
2632 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2633 return Builder.CreateBitOrPointerCast(V, DstVTy);
2635 // V cannot be directly cast to the desired vector type.
2636 // May happen when V is a floating point vector but DstVTy is a vector of
2637 // pointers or vice-versa. Handle this using a two-step bitcast using an
2638 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2639 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2640 "Only one type should be a pointer type");
2641 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2642 "Only one type should be a floating point type");
2643 Type *IntTy =
2644 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2645 VectorType *VecIntTy = VectorType::get(IntTy, VF);
2646 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2647 return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2651 BasicBlock *Bypass) {
2652 Value *Count = getOrCreateTripCount(L);
2653 BasicBlock *BB = L->getLoopPreheader();
2654 IRBuilder<> Builder(BB->getTerminator());
2656 // Generate code to check if the loop's trip count is less than VF * UF, or
2657 // equal to it in case a scalar epilogue is required; this implies that the
2658 // vector trip count is zero. This check also covers the case where adding one
2659 // to the backedge-taken count overflowed, leading to an incorrect trip count
2660 // of zero. In this case we will also jump to the scalar loop.
2661 auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662 : ICmpInst::ICMP_ULT;
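  // For example, with VF = 4, UF = 2 and no required scalar epilogue, the
  // bypass to the scalar loop is taken when the trip count is less than 8.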
2664 // If tail is to be folded, vector loop takes care of all iterations.
2665 Value *CheckMinIters = Builder.getFalse();
2666 if (!Cost->foldTailByMasking())
2667 CheckMinIters = Builder.CreateICmp(
2668 P, Count, ConstantInt::get(Count->getType(), VF * UF),
2669 "min.iters.check");
2671 BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2672 // Update dominator tree immediately if the generated block is a
2673 // LoopBypassBlock because SCEV expansions to generate loop bypass
2674 // checks may query it before the current function is finished.
2675 DT->addNewBlock(NewBB, BB);
2676 if (L->getParentLoop())
2677 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2678 ReplaceInstWithInst(BB->getTerminator(),
2679 BranchInst::Create(Bypass, NewBB, CheckMinIters));
2680 LoopBypassBlocks.push_back(BB);
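// [Editorial note] Rough shape of the guard emitted above for VF=4, UF=2 with
// no tail folding (value and block names are illustrative):
//   %min.iters.check = icmp ult i64 %trip.count, 8  ; ule if a scalar epilogue
//                                                   ; is required
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph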
2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2684 BasicBlock *BB = L->getLoopPreheader();
2686 // Generate the code to check the SCEV assumptions that we made.
2687 // We want the new basic block to start at the first instruction in a
2688 // sequence of instructions that form a check.
2689 SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2690 "scev.check");
2691 Value *SCEVCheck =
2692 Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2694 if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2695 if (C->isZero())
2696 return;
2698 assert(!Cost->foldTailByMasking() &&
2699 "Cannot SCEV check stride or overflow when folding tail");
2700 // Create a new block containing the stride check.
2701 BB->setName("vector.scevcheck");
2702 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2703 // Update dominator tree immediately if the generated block is a
2704 // LoopBypassBlock because SCEV expansions to generate loop bypass
2705 // checks may query it before the current function is finished.
2706 DT->addNewBlock(NewBB, BB);
2707 if (L->getParentLoop())
2708 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2709 ReplaceInstWithInst(BB->getTerminator(),
2710 BranchInst::Create(Bypass, NewBB, SCEVCheck));
2711 LoopBypassBlocks.push_back(BB);
2712 AddedSafetyChecks = true;
2715 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2716 // VPlan-native path does not do any analysis for runtime checks currently.
2717 if (EnableVPlanNativePath)
2718 return;
2720 BasicBlock *BB = L->getLoopPreheader();
2722 // Generate the code that checks at runtime whether arrays overlap. We put the
2723 // checks into a separate block to make the more common case of few elements
2724 // faster.
2725 Instruction *FirstCheckInst;
2726 Instruction *MemRuntimeCheck;
2727 std::tie(FirstCheckInst, MemRuntimeCheck) =
2728 Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2729 if (!MemRuntimeCheck)
2730 return;
2732 assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2733 // Create a new block containing the memory check.
2734 BB->setName("vector.memcheck");
2735 auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2736 // Update dominator tree immediately if the generated block is a
2737 // LoopBypassBlock because SCEV expansions to generate loop bypass
2738 // checks may query it before the current function is finished.
2739 DT->addNewBlock(NewBB, BB);
2740 if (L->getParentLoop())
2741 L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2742 ReplaceInstWithInst(BB->getTerminator(),
2743 BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2744 LoopBypassBlocks.push_back(BB);
2745 AddedSafetyChecks = true;
2747 // We currently don't use LoopVersioning for the actual loop cloning but we
2748 // still use it to add the noalias metadata.
2749 LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2750 PSE.getSE());
2751 LVer->prepareNoAliasMetadata();
2754 Value *InnerLoopVectorizer::emitTransformedIndex(
2755 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2756 const InductionDescriptor &ID) const {
2758 SCEVExpander Exp(*SE, DL, "induction");
2759 auto Step = ID.getStep();
2760 auto StartValue = ID.getStartValue();
2761 assert(Index->getType() == Step->getType() &&
2762 "Index type does not match StepValue type");
2764 // Note: the IR at this point is broken. We cannot use SE to create any new
2765 // SCEV and then expand it, hoping that SCEV's simplification will give us
2766 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2767 // lead to various SCEV crashes. So all we can do is use the builder and rely
2768 // on InstCombine for future simplifications. Here we handle some trivial
2769 // cases only.
2770 auto CreateAdd = [&B](Value *X, Value *Y) {
2771 assert(X->getType() == Y->getType() && "Types don't match!");
2772 if (auto *CX = dyn_cast<ConstantInt>(X))
2773 if (CX->isZero())
2774 return Y;
2775 if (auto *CY = dyn_cast<ConstantInt>(Y))
2776 if (CY->isZero())
2777 return X;
2778 return B.CreateAdd(X, Y);
2781 auto CreateMul = [&B](Value *X, Value *Y) {
2782 assert(X->getType() == Y->getType() && "Types don't match!");
2783 if (auto *CX = dyn_cast<ConstantInt>(X))
2784 if (CX->isOne())
2785 return Y;
2786 if (auto *CY = dyn_cast<ConstantInt>(Y))
2787 if (CY->isOne())
2788 return X;
2789 return B.CreateMul(X, Y);
2792 switch (ID.getKind()) {
2793 case InductionDescriptor::IK_IntInduction: {
2794 assert(Index->getType() == StartValue->getType() &&
2795 "Index type does not match StartValue type");
2796 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2797 return B.CreateSub(StartValue, Index);
2798 auto *Offset = CreateMul(
2799 Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2800 return CreateAdd(StartValue, Offset);
2802 case InductionDescriptor::IK_PtrInduction: {
2803 assert(isa<SCEVConstant>(Step) &&
2804 "Expected constant step for pointer induction");
2805 return B.CreateGEP(
2806 StartValue->getType()->getPointerElementType(), StartValue,
2807 CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2808 &*B.GetInsertPoint())));
2810 case InductionDescriptor::IK_FpInduction: {
2811 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2812 auto InductionBinOp = ID.getInductionBinOp();
2813 assert(InductionBinOp &&
2814 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2815 InductionBinOp->getOpcode() == Instruction::FSub) &&
2816 "Original bin op should be defined for FP induction");
2818 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2820 // Floating point operations had to be 'fast' to enable the induction.
2821 FastMathFlags Flags;
2822 Flags.setFast();
2824 Value *MulExp = B.CreateFMul(StepValue, Index);
2825 if (isa<Instruction>(MulExp))
2826 // We have to check this because MulExp may be a constant.
2827 cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2829 Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2830 "induction");
2831 if (isa<Instruction>(BOp))
2832 cast<Instruction>(BOp)->setFastMathFlags(Flags);
2834 return BOp;
2836 case InductionDescriptor::IK_NoInduction:
2837 return nullptr;
2839 llvm_unreachable("invalid enum");
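// [Editorial note] Illustrative results of emitTransformedIndex, where Start,
// Step and Index stand for the descriptor's start value, step and the given
// index:
//   IK_IntInduction : Start + Index * Step  (the CreateAdd/CreateMul helpers
//                     above fold "* 1" and "+ 0"; a constant step of -1
//                     becomes Start - Index)
//   IK_PtrInduction : gep Start, Index * Step
//   IK_FpInduction  : Start fadd/fsub (Index fmul Step), with fast-math flags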
2842 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2844 In this function we generate a new loop. The new loop will contain
2845 the vectorized instructions while the old loop will continue to run the
2846 scalar remainder.
2848        [ ] <-- loop iteration number check.
2849       /   |
2850      /    v
2851     |    [ ]     <-- vector loop bypass (may consist of multiple blocks).
2852     |  /  |
2853     | /   v
2854     ||   [ ]     <-- vector pre header.
2855     |/    |
2856     |     v
2857     |    [ ] \
2858     |    [ ]_|   <-- vector loop.
2859     |     |
2860     |     v
2861     |   -[ ]     <--- middle-block.
2862     |  /  |
2863     | /   v
2864     -|- >[ ]     <--- new preheader.
2865      |    |
2866      |    v
2867      |   [ ] \
2868      |   [ ]_|   <-- old scalar loop to handle remainder.
2869       \   |
2870        \  v
2871         >[ ]     <-- exit block.
2875 BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2876 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2877 BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2878 MDNode *OrigLoopID = OrigLoop->getLoopID();
2879 assert(VectorPH && "Invalid loop structure");
2880 assert(ExitBlock && "Must have an exit block");
2882 // Some loops have a single integer induction variable, while other loops
2883 // don't. One example is C++ iterators, which often have multiple pointer
2884 // induction variables. In the code below we also support a case where we
2885 // don't have a single induction variable.
2887 // We try as hard as possible to obtain an induction variable from the
2888 // original loop. However, if we don't find one that:
2889 // - is an integer
2890 // - counts from zero, stepping by one
2891 // - is the size of the widest induction variable type
2892 // then we create a new one.
2893 OldInduction = Legal->getPrimaryInduction();
2894 Type *IdxTy = Legal->getWidestInductionType();
2896 // Split the single block loop into the two loop structure described above.
2897 BasicBlock *VecBody =
2898 VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2899 BasicBlock *MiddleBlock =
2900 VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2901 BasicBlock *ScalarPH =
2902 MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2904 // Create and register the new vector loop.
2905 Loop *Lp = LI->AllocateLoop();
2906 Loop *ParentLoop = OrigLoop->getParentLoop();
2908 // Insert the new loop into the loop nest and register the new basic blocks
2909 // before calling any utilities such as SCEV that require valid LoopInfo.
2910 if (ParentLoop) {
2911 ParentLoop->addChildLoop(Lp);
2912 ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2913 ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2914 } else {
2915 LI->addTopLevelLoop(Lp);
2917 Lp->addBasicBlockToLoop(VecBody, *LI);
2919 // Find the loop boundaries.
2920 Value *Count = getOrCreateTripCount(Lp);
2922 Value *StartIdx = ConstantInt::get(IdxTy, 0);
2924 // Now, compare the new count to zero. If it is zero skip the vector loop and
2925 // jump to the scalar loop. This check also covers the case where the
2926 // backedge-taken count is uint##_max: adding one to it will overflow leading
2927 // to an incorrect trip count of zero. In this (rare) case we will also jump
2928 // to the scalar loop.
2929 emitMinimumIterationCountCheck(Lp, ScalarPH);
2931 // Generate the code to check any assumptions that we've made for SCEV
2932 // expressions.
2933 emitSCEVChecks(Lp, ScalarPH);
2935 // Generate the code that checks at runtime whether arrays overlap. We put the
2936 // checks into a separate block to make the more common case of few elements
2937 // faster.
2938 emitMemRuntimeChecks(Lp, ScalarPH);
2940 // Generate the induction variable.
2941 // The loop step is equal to the vectorization factor (num of SIMD elements)
2942 // times the unroll factor (num of SIMD instructions).
2943 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2944 Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2945 Induction =
2946 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2947 getDebugLocFromInstOrOperands(OldInduction));
2949 // We are going to resume the execution of the scalar loop.
2950 // Go over all of the induction variables that we found and fix the
2951 // PHIs that are left in the scalar version of the loop.
2952 // The starting values of PHI nodes depend on the counter of the last
2953 // iteration in the vectorized loop.
2954 // If we come from a bypass edge then we need to start from the original
2955 // start value.
2957 // This variable saves the new starting index for the scalar loop. It is used
2958 // to test if there are any tail iterations left once the vector loop has
2959 // completed.
2960 LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2961 for (auto &InductionEntry : *List) {
2962 PHINode *OrigPhi = InductionEntry.first;
2963 InductionDescriptor II = InductionEntry.second;
2965 // Create phi nodes to merge from the backedge-taken check block.
2966 PHINode *BCResumeVal = PHINode::Create(
2967 OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2968 // Copy original phi DL over to the new one.
2969 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2970 Value *&EndValue = IVEndValues[OrigPhi];
2971 if (OrigPhi == OldInduction) {
2972 // We know what the end value is.
2973 EndValue = CountRoundDown;
2974 } else {
2975 IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2976 Type *StepType = II.getStep()->getType();
2977 Instruction::CastOps CastOp =
2978 CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2979 Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2980 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2981 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2982 EndValue->setName("ind.end");
2985 // The new PHI merges the original incoming value, in case of a bypass,
2986 // or the value at the end of the vectorized loop.
2987 BCResumeVal->addIncoming(EndValue, MiddleBlock);
2989 // Fix the scalar body counter (PHI node).
2990 // The old induction's phi node in the scalar body needs the truncated
2991 // value.
2992 for (BasicBlock *BB : LoopBypassBlocks)
2993 BCResumeVal->addIncoming(II.getStartValue(), BB);
2994 OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2997 // We need the OrigLoop (scalar loop part) latch terminator to help
2998 // produce correct debug info for the middle block BB instructions.
2999 // The legality check stage guarantees that the loop will have a single
3000 // latch.
3001 assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3002 "Scalar loop latch terminator isn't a branch");
3003 BranchInst *ScalarLatchBr =
3004 cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3006 // Add a check in the middle block to see if we have completed
3007 // all of the iterations in the first vector loop.
3008 // If (N - N%VF) == N, then we *don't* need to run the remainder.
3009 // If tail is to be folded, we know we don't need to run the remainder.
3010 Value *CmpN = Builder.getTrue();
3011 if (!Cost->foldTailByMasking()) {
3012 CmpN =
3013 CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3014 CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3016 // Here we use the same DebugLoc as the scalar loop latch branch instead
3017 // of the corresponding compare because they may have ended up with
3018 // different line numbers and we want to avoid awkward line stepping while
3019 // debugging. E.g., if the compare got a line number inside the loop.
3020 cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3023 BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3024 BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3025 ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3027 // Get ready to start creating new instructions into the vectorized body.
3028 Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3030 // Save the state.
3031 LoopVectorPreHeader = Lp->getLoopPreheader();
3032 LoopScalarPreHeader = ScalarPH;
3033 LoopMiddleBlock = MiddleBlock;
3034 LoopExitBlock = ExitBlock;
3035 LoopVectorBody = VecBody;
3036 LoopScalarBody = OldBasicBlock;
3038 Optional<MDNode *> VectorizedLoopID =
3039 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3040 LLVMLoopVectorizeFollowupVectorized});
3041 if (VectorizedLoopID.hasValue()) {
3042 Lp->setLoopID(VectorizedLoopID.getValue());
3044 // Do not setAlreadyVectorized if loop attributes have been defined
3045 // explicitly.
3046 return LoopVectorPreHeader;
3049 // Keep all loop hints from the original loop on the vector loop (we'll
3050 // replace the vectorizer-specific hints below).
3051 if (MDNode *LID = OrigLoop->getLoopID())
3052 Lp->setLoopID(LID);
3054 LoopVectorizeHints Hints(Lp, true, *ORE);
3055 Hints.setAlreadyVectorized();
3057 return LoopVectorPreHeader;
3060 // Fix up external users of the induction variable. At this point, we are
3061 // in LCSSA form, with all external PHIs that use the IV having one input value,
3062 // coming from the remainder loop. We need those PHIs to also have a correct
3063 // value for the IV when arriving directly from the middle block.
3064 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3065 const InductionDescriptor &II,
3066 Value *CountRoundDown, Value *EndValue,
3067 BasicBlock *MiddleBlock) {
3068 // There are two kinds of external IV usages - those that use the value
3069 // computed in the last iteration (the PHI) and those that use the penultimate
3070 // value (the value that feeds into the phi from the loop latch).
3071 // We allow both, but they, obviously, have different values.
3073 assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3075 DenseMap<Value *, Value *> MissingVals;
3077 // An external user of the last iteration's value should see the value that
3078 // the remainder loop uses to initialize its own IV.
3079 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3080 for (User *U : PostInc->users()) {
3081 Instruction *UI = cast<Instruction>(U);
3082 if (!OrigLoop->contains(UI)) {
3083 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3084 MissingVals[UI] = EndValue;
3088 // An external user of the penultimate value needs to see EndValue - Step.
3089 // The simplest way to get this is to recompute it from the constituent SCEVs,
3090 // that is Start + (Step * (CRD - 1)).
3091 for (User *U : OrigPhi->users()) {
3092 auto *UI = cast<Instruction>(U);
3093 if (!OrigLoop->contains(UI)) {
3094 const DataLayout &DL =
3095 OrigLoop->getHeader()->getModule()->getDataLayout();
3096 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3098 IRBuilder<> B(MiddleBlock->getTerminator());
3099 Value *CountMinusOne = B.CreateSub(
3100 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3101 Value *CMO =
3102 !II.getStep()->getType()->isIntegerTy()
3103 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3104 II.getStep()->getType())
3105 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3106 CMO->setName("cast.cmo");
3107 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3108 Escape->setName("ind.escape");
3109 MissingVals[UI] = Escape;
3113 for (auto &I : MissingVals) {
3114 PHINode *PHI = cast<PHINode>(I.first);
3115 // One corner case we have to handle is two IVs "chasing" each other,
3116 // that is %IV2 = phi [...], [ %IV1, %latch ]
3117 // In this case, if IV1 has an external use, we need to avoid adding both
3118 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3119 // don't already have an incoming value for the middle block.
3120 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3121 PHI->addIncoming(I.second, MiddleBlock);
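// [Editorial note] Worked example for the penultimate-value computation above
// (numbers are hypothetical): with Start = 0, Step = 2 and a vector trip count
// CRD = 8, an external user of the phi itself sees
//   Start + Step * (CRD - 1) = 0 + 2 * 7 = 14,
// while an external user of the post-increment value sees EndValue = 16.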
3125 namespace {
3127 struct CSEDenseMapInfo {
3128 static bool canHandle(const Instruction *I) {
3129 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3130 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3133 static inline Instruction *getEmptyKey() {
3134 return DenseMapInfo<Instruction *>::getEmptyKey();
3137 static inline Instruction *getTombstoneKey() {
3138 return DenseMapInfo<Instruction *>::getTombstoneKey();
3141 static unsigned getHashValue(const Instruction *I) {
3142 assert(canHandle(I) && "Unknown instruction!");
3143 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3144 I->value_op_end()));
3147 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3148 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3149 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3150 return LHS == RHS;
3151 return LHS->isIdenticalTo(RHS);
3155 } // end anonymous namespace
3157 /// Perform CSE of induction variable instructions.
3158 static void cse(BasicBlock *BB) {
3159 // Perform simple cse.
3160 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3161 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3162 Instruction *In = &*I++;
3164 if (!CSEDenseMapInfo::canHandle(In))
3165 continue;
3167 // Check if we can replace this instruction with any of the
3168 // visited instructions.
3169 if (Instruction *V = CSEMap.lookup(In)) {
3170 In->replaceAllUsesWith(V);
3171 In->eraseFromParent();
3172 continue;
3175 CSEMap[In] = In;
3179 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3180 unsigned VF,
3181 bool &NeedToScalarize) {
3182 Function *F = CI->getCalledFunction();
3183 StringRef FnName = CI->getCalledFunction()->getName();
3184 Type *ScalarRetTy = CI->getType();
3185 SmallVector<Type *, 4> Tys, ScalarTys;
3186 for (auto &ArgOp : CI->arg_operands())
3187 ScalarTys.push_back(ArgOp->getType());
3189 // Estimate cost of scalarized vector call. The source operands are assumed
3190 // to be vectors, so we need to extract individual elements from there,
3191 // execute VF scalar calls, and then gather the result into the vector return
3192 // value.
3193 unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3194 if (VF == 1)
3195 return ScalarCallCost;
3197 // Compute corresponding vector type for return value and arguments.
3198 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3199 for (Type *ScalarTy : ScalarTys)
3200 Tys.push_back(ToVectorTy(ScalarTy, VF));
3202 // Compute costs of unpacking argument values for the scalar calls and
3203 // packing the return values to a vector.
3204 unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3206 unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3208 // If we can't emit a vector call for this function, then the currently found
3209 // cost is the cost we need to return.
3210 NeedToScalarize = true;
3211 if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3212 return Cost;
3214 // If the corresponding vector cost is cheaper, return its cost.
3215 unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3216 if (VectorCallCost < Cost) {
3217 NeedToScalarize = false;
3218 return VectorCallCost;
3220 return Cost;
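// [Editorial note] Worked example for the cost comparison above (all numbers
// hypothetical): with VF = 4, a scalar call cost of 18 and a scalarization
// overhead of 12, the scalarized cost is 18 * 4 + 12 = 84. If TLI reports a
// vectorized variant costing 30, NeedToScalarize is cleared and 30 is
// returned; otherwise the scalarized cost of 84 is returned.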
3223 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3224 unsigned VF) {
3225 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3226 assert(ID && "Expected intrinsic call!");
3228 FastMathFlags FMF;
3229 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3230 FMF = FPMO->getFastMathFlags();
3232 SmallVector<Value *, 4> Operands(CI->arg_operands());
3233 return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3236 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3237 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3238 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3239 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3241 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3242 auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3243 auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3244 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3247 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3248 // For every instruction `I` in MinBWs, truncate the operands, create a
3249 // truncated version of `I` and reextend its result. InstCombine runs
3250 // later and will remove any ext/trunc pairs.
3251 SmallPtrSet<Value *, 4> Erased;
3252 for (const auto &KV : Cost->getMinimalBitwidths()) {
3253 // If the value wasn't vectorized, we must maintain the original scalar
3254 // type. The absence of the value from VectorLoopValueMap indicates that it
3255 // wasn't vectorized.
3256 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3257 continue;
3258 for (unsigned Part = 0; Part < UF; ++Part) {
3259 Value *I = getOrCreateVectorValue(KV.first, Part);
3260 if (Erased.find(I) != Erased.end() || I->use_empty() ||
3261 !isa<Instruction>(I))
3262 continue;
3263 Type *OriginalTy = I->getType();
3264 Type *ScalarTruncatedTy =
3265 IntegerType::get(OriginalTy->getContext(), KV.second);
3266 Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3267 OriginalTy->getVectorNumElements());
3268 if (TruncatedTy == OriginalTy)
3269 continue;
3271 IRBuilder<> B(cast<Instruction>(I));
3272 auto ShrinkOperand = [&](Value *V) -> Value * {
3273 if (auto *ZI = dyn_cast<ZExtInst>(V))
3274 if (ZI->getSrcTy() == TruncatedTy)
3275 return ZI->getOperand(0);
3276 return B.CreateZExtOrTrunc(V, TruncatedTy);
3279 // The actual instruction modification depends on the instruction type,
3280 // unfortunately.
3281 Value *NewI = nullptr;
3282 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3283 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3284 ShrinkOperand(BO->getOperand(1)));
3286 // Any wrapping introduced by shrinking this operation shouldn't be
3287 // considered undefined behavior. So, we can't unconditionally copy
3288 // arithmetic wrapping flags to NewI.
3289 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3290 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3291 NewI =
3292 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3293 ShrinkOperand(CI->getOperand(1)));
3294 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3295 NewI = B.CreateSelect(SI->getCondition(),
3296 ShrinkOperand(SI->getTrueValue()),
3297 ShrinkOperand(SI->getFalseValue()));
3298 } else if (auto *CI = dyn_cast<CastInst>(I)) {
3299 switch (CI->getOpcode()) {
3300 default:
3301 llvm_unreachable("Unhandled cast!");
3302 case Instruction::Trunc:
3303 NewI = ShrinkOperand(CI->getOperand(0));
3304 break;
3305 case Instruction::SExt:
3306 NewI = B.CreateSExtOrTrunc(
3307 CI->getOperand(0),
3308 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3309 break;
3310 case Instruction::ZExt:
3311 NewI = B.CreateZExtOrTrunc(
3312 CI->getOperand(0),
3313 smallestIntegerVectorType(OriginalTy, TruncatedTy));
3314 break;
3316 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3317 auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3318 auto *O0 = B.CreateZExtOrTrunc(
3319 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3320 auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3321 auto *O1 = B.CreateZExtOrTrunc(
3322 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3324 NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3325 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3326 // Don't do anything with the operands, just extend the result.
3327 continue;
3328 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3329 auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3330 auto *O0 = B.CreateZExtOrTrunc(
3331 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3332 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3333 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3334 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3335 auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3336 auto *O0 = B.CreateZExtOrTrunc(
3337 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3338 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3339 } else {
3340 // If we don't know what to do, be conservative and don't do anything.
3341 continue;
3344 // Lastly, extend the result.
3345 NewI->takeName(cast<Instruction>(I));
3346 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3347 I->replaceAllUsesWith(Res);
3348 cast<Instruction>(I)->eraseFromParent();
3349 Erased.insert(I);
3350 VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3354 // We'll have created a bunch of ZExts that are now parentless. Clean up.
3355 for (const auto &KV : Cost->getMinimalBitwidths()) {
3356 // If the value wasn't vectorized, we must maintain the original scalar
3357 // type. The absence of the value from VectorLoopValueMap indicates that it
3358 // wasn't vectorized.
3359 if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3360 continue;
3361 for (unsigned Part = 0; Part < UF; ++Part) {
3362 Value *I = getOrCreateVectorValue(KV.first, Part);
3363 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3364 if (Inst && Inst->use_empty()) {
3365 Value *NewI = Inst->getOperand(0);
3366 Inst->eraseFromParent();
3367 VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
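// [Editorial note] Illustrative effect of truncateToMinimalBitwidths, assuming
// the cost model proved 8 bits suffice for a 32-bit add and VF = 4:
//   %a.trunc = trunc <4 x i32> %a to <4 x i8>
//   %b.trunc = trunc <4 x i32> %b to <4 x i8>
//   %add     = add <4 x i8> %a.trunc, %b.trunc
//   %add.ext = zext <4 x i8> %add to <4 x i32>
// InstCombine is expected to remove the redundant trunc/zext pairs later.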
3373 void InnerLoopVectorizer::fixVectorizedLoop() {
3374 // Insert truncates and extends for any truncated instructions as hints to
3375 // InstCombine.
3376 if (VF > 1)
3377 truncateToMinimalBitwidths();
3379 // Fix widened non-induction PHIs by setting up the PHI operands.
3380 if (OrigPHIsToFix.size()) {
3381 assert(EnableVPlanNativePath &&
3382 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3383 fixNonInductionPHIs();
3386 // At this point every instruction in the original loop is widened to a
3387 // vector form. Now we need to fix the recurrences in the loop. These PHI
3388 // nodes are currently empty because we did not want to introduce cycles.
3389 // This is the second stage of vectorizing recurrences.
3390 fixCrossIterationPHIs();
3392 // Update the dominator tree.
3394 // FIXME: After creating the structure of the new loop, the dominator tree is
3395 // no longer up-to-date, and it remains that way until we update it
3396 // here. An out-of-date dominator tree is problematic for SCEV,
3397 // because SCEVExpander uses it to guide code generation. The
3398 // vectorizer uses SCEVExpanders in several places. Instead, we should
3399 // keep the dominator tree up-to-date as we go.
3400 updateAnalysis();
3402 // Fix-up external users of the induction variables.
3403 for (auto &Entry : *Legal->getInductionVars())
3404 fixupIVUsers(Entry.first, Entry.second,
3405 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3406 IVEndValues[Entry.first], LoopMiddleBlock);
3408 fixLCSSAPHIs();
3409 for (Instruction *PI : PredicatedInstructions)
3410 sinkScalarOperands(&*PI);
3412 // Remove redundant induction instructions.
3413 cse(LoopVectorBody);
3416 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3417 // In order to support recurrences we need to be able to vectorize Phi nodes.
3418 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3419 // stage #2: We now need to fix the recurrences by adding incoming edges to
3420 // the currently empty PHI nodes. At this point every instruction in the
3421 // original loop is widened to a vector form so we can use them to construct
3422 // the incoming edges.
3423 for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3424 // Handle first-order recurrences and reductions that need to be fixed.
3425 if (Legal->isFirstOrderRecurrence(&Phi))
3426 fixFirstOrderRecurrence(&Phi);
3427 else if (Legal->isReductionVariable(&Phi))
3428 fixReduction(&Phi);
3432 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3433 // This is the second phase of vectorizing first-order recurrences. An
3434 // overview of the transformation is described below. Suppose we have the
3435 // following loop.
3437 // for (int i = 0; i < n; ++i)
3438 // b[i] = a[i] - a[i - 1];
3440 // There is a first-order recurrence on "a". For this loop, the shorthand
3441 // scalar IR looks like:
3443 // scalar.ph:
3444 // s_init = a[-1]
3445 // br scalar.body
3447 // scalar.body:
3448 // i = phi [0, scalar.ph], [i+1, scalar.body]
3449 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3450 // s2 = a[i]
3451 // b[i] = s2 - s1
3452 // br cond, scalar.body, ...
3454 // In this example, s1 is a recurrence because its value depends on the
3455 // previous iteration. In the first phase of vectorization, we created a
3456 // temporary value for s1. We now complete the vectorization and produce the
3457 // shorthand vector IR shown below (for VF = 4, UF = 1).
3459 // vector.ph:
3460 // v_init = vector(..., ..., ..., a[-1])
3461 // br vector.body
3463 // vector.body
3464 // i = phi [0, vector.ph], [i+4, vector.body]
3465 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3466 // v2 = a[i, i+1, i+2, i+3];
3467 // v3 = vector(v1(3), v2(0, 1, 2))
3468 // b[i, i+1, i+2, i+3] = v2 - v3
3469 // br cond, vector.body, middle.block
3471 // middle.block:
3472 // x = v2(3)
3473 // br scalar.ph
3475 // scalar.ph:
3476 // s_init = phi [x, middle.block], [a[-1], otherwise]
3477 // br scalar.body
3479 // After the vector loop finishes executing, we extract the next value of
3480 // the recurrence (x) to use as the initial value in the scalar loop.
3482 // Get the original loop preheader and single loop latch.
3483 auto *Preheader = OrigLoop->getLoopPreheader();
3484 auto *Latch = OrigLoop->getLoopLatch();
3486 // Get the initial and previous values of the scalar recurrence.
3487 auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3488 auto *Previous = Phi->getIncomingValueForBlock(Latch);
3490 // Create a vector from the initial value.
3491 auto *VectorInit = ScalarInit;
3492 if (VF > 1) {
3493 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3494 VectorInit = Builder.CreateInsertElement(
3495 UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3496 Builder.getInt32(VF - 1), "vector.recur.init");
3499 // We constructed a temporary phi node in the first phase of vectorization.
3500 // This phi node will eventually be deleted.
3501 Builder.SetInsertPoint(
3502 cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3504 // Create a phi node for the new recurrence. The current value will either be
3505 // the initial value inserted into a vector or loop-varying vector value.
3506 auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3507 VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3509 // Get the vectorized previous value of the last part UF - 1. It appears last
3510 // among all unrolled iterations, due to the order of their construction.
3511 Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3513 // Set the insertion point after the previous value if it is an instruction.
3514 // Note that the previous value may have been constant-folded so it is not
3515 // guaranteed to be an instruction in the vector loop. Also, if the previous
3516 // value is a phi node, we should insert after all the phi nodes to avoid
3517 // breaking basic block verification.
3518 if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3519 isa<PHINode>(PreviousLastPart))
3520 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3521 else
3522 Builder.SetInsertPoint(
3523 &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3525 // We will construct a vector for the recurrence by combining the values for
3526 // the current and previous iterations. This is the required shuffle mask.
3527 SmallVector<Constant *, 8> ShuffleMask(VF);
3528 ShuffleMask[0] = Builder.getInt32(VF - 1);
3529 for (unsigned I = 1; I < VF; ++I)
3530 ShuffleMask[I] = Builder.getInt32(I + VF - 1);
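// [Editorial note] For example, with VF == 4 the mask is <3, 4, 5, 6>: the
// shuffle below takes lane 3 of the incoming (previous-iteration) vector
// followed by lanes 0..2 of the current part, matching v3 in the comment at
// the top of this function.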
3532 // The vector from which to take the initial value for the current iteration
3533 // (actual or unrolled). Initially, this is the vector phi node.
3534 Value *Incoming = VecPhi;
3536 // Shuffle the current and previous vector and update the vector parts.
3537 for (unsigned Part = 0; Part < UF; ++Part) {
3538 Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3539 Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3540 auto *Shuffle =
3541 VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3542 ConstantVector::get(ShuffleMask))
3543 : Incoming;
3544 PhiPart->replaceAllUsesWith(Shuffle);
3545 cast<Instruction>(PhiPart)->eraseFromParent();
3546 VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3547 Incoming = PreviousPart;
3550 // Fix the latch value of the new recurrence in the vector loop.
3551 VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3553 // Extract the last vector element in the middle block. This will be the
3554 // initial value for the recurrence when jumping to the scalar loop.
3555 auto *ExtractForScalar = Incoming;
3556 if (VF > 1) {
3557 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3558 ExtractForScalar = Builder.CreateExtractElement(
3559 ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3561 // Extract the second-to-last element in the middle block if the
3562 // Phi is used outside the loop. We need to extract the phi itself
3563 // and not the last element (the phi update in the current iteration). This
3564 // will be the value when jumping to the exit block from the LoopMiddleBlock,
3565 // when the scalar loop is not run at all.
3566 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3567 if (VF > 1)
3568 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3569 Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3570 // When the loop is unrolled without vectorizing, initialize
3571 // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3572 // `Incoming`. This is analogous to the vectorized case above: extracting the
3573 // second-to-last element when VF > 1.
3574 else if (UF > 1)
3575 ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3577 // Fix the initial value of the original recurrence in the scalar loop.
3578 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3579 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3580 for (auto *BB : predecessors(LoopScalarPreHeader)) {
3581 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3582 Start->addIncoming(Incoming, BB);
3585 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3586 Phi->setName("scalar.recur");
3588 // Finally, fix users of the recurrence outside the loop. The users will need
3589 // either the last value of the scalar recurrence or the last value of the
3590 // vector recurrence we extracted in the middle block. Since the loop is in
3591 // LCSSA form, we just need to find all the phi nodes for the original scalar
3592 // recurrence in the exit block, and then add an edge for the middle block.
3593 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3594 if (LCSSAPhi.getIncomingValue(0) == Phi) {
3595 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3600 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3601 Constant *Zero = Builder.getInt32(0);
3603 // Get its reduction variable descriptor.
3604 assert(Legal->isReductionVariable(Phi) &&
3605 "Unable to find the reduction variable");
3606 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3608 RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3609 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3610 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3611 RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3612 RdxDesc.getMinMaxRecurrenceKind();
3613 setDebugLocFromInst(Builder, ReductionStartValue);
3615 // We need to generate a reduction vector from the incoming scalar.
3616 // To do so, we need to generate the 'identity' vector and override
3617 // one of the elements with the incoming scalar reduction. We need
3618 // to do it in the vector-loop preheader.
3619 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3621 // This is the vector-clone of the value that leaves the loop.
3622 Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3624 // Find the reduction identity variable: zero for addition, or, and xor;
3625 // one for multiplication; -1 for and.
3626 Value *Identity;
3627 Value *VectorStart;
3628 if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3629 RK == RecurrenceDescriptor::RK_FloatMinMax) {
3630 // MinMax reductions have the start value as their identity.
3631 if (VF == 1) {
3632 VectorStart = Identity = ReductionStartValue;
3633 } else {
3634 VectorStart = Identity =
3635 Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3637 } else {
3638 // Handle other reduction kinds:
3639 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3640 RK, VecTy->getScalarType());
3641 if (VF == 1) {
3642 Identity = Iden;
3643 // This vector is the Identity vector where the first element is the
3644 // incoming scalar reduction.
3645 VectorStart = ReductionStartValue;
3646 } else {
3647 Identity = ConstantVector::getSplat(VF, Iden);
3649 // This vector is the Identity vector where the first element is the
3650 // incoming scalar reduction.
3651 VectorStart =
3652 Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3656 // Fix the vector-loop phi.
3658 // Reductions do not have to start at zero. They can start with
3659 // any loop invariant values.
3660 BasicBlock *Latch = OrigLoop->getLoopLatch();
3661 Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3662 for (unsigned Part = 0; Part < UF; ++Part) {
3663 Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3664 Value *Val = getOrCreateVectorValue(LoopVal, Part);
3665 // Make sure to add the reduction start value only to the
3666 // first unroll part.
3667 Value *StartVal = (Part == 0) ? VectorStart : Identity;
3668 cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3669 cast<PHINode>(VecRdxPhi)
3670 ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3673 // Before each round, move the insertion point right between
3674 // the PHIs and the values we are going to write.
3675 // This allows us to write both PHINodes and the extractelement
3676 // instructions.
3677 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3679 setDebugLocFromInst(Builder, LoopExitInst);
3681 // If tail is folded by masking, the vector value to leave the loop should be
3682 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3683 // instead of the former.
3684 if (Cost->foldTailByMasking()) {
3685 for (unsigned Part = 0; Part < UF; ++Part) {
3686 Value *VecLoopExitInst =
3687 VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3688 Value *Sel = nullptr;
3689 for (User *U : VecLoopExitInst->users()) {
3690 if (isa<SelectInst>(U)) {
3691 assert(!Sel && "Reduction exit feeding two selects");
3692 Sel = U;
3693 } else
3694 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3696 assert(Sel && "Reduction exit feeds no select");
3697 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3701 // If the vector reduction can be performed in a smaller type, we truncate
3702 // then extend the loop exit value to enable InstCombine to evaluate the
3703 // entire expression in the smaller type.
3704 if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3705 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3706 Builder.SetInsertPoint(
3707 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3708 VectorParts RdxParts(UF);
3709 for (unsigned Part = 0; Part < UF; ++Part) {
3710 RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3711 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3712 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3713 : Builder.CreateZExt(Trunc, VecTy);
3714 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3715 UI != RdxParts[Part]->user_end();)
3716 if (*UI != Trunc) {
3717 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3718 RdxParts[Part] = Extnd;
3719 } else {
3720 ++UI;
3723 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3724 for (unsigned Part = 0; Part < UF; ++Part) {
3725 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3726 VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3730 // Reduce all of the unrolled parts into a single vector.
3731 Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3732 unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3734 // The middle block terminator has already been assigned a DebugLoc here (the
3735 // OrigLoop's single latch terminator). We want the whole middle block to
3736 // appear to execute on this line because: (a) it is all compiler generated,
3737 // (b) these instructions are always executed after evaluating the latch
3738 // conditional branch, and (c) other passes may add new predecessors which
3739 // terminate on this line. This is the easiest way to ensure we don't
3740 // accidentally cause an extra step back into the loop while debugging.
3741 setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3742 for (unsigned Part = 1; Part < UF; ++Part) {
3743 Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3744 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3745 // Floating point operations had to be 'fast' to enable the reduction.
3746 ReducedPartRdx = addFastMathFlag(
3747 Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3748 ReducedPartRdx, "bin.rdx"),
3749 RdxDesc.getFastMathFlags());
3750 else
3751 ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3752 RdxPart);
3755 if (VF > 1) {
3756 bool NoNaN = Legal->hasFunNoNaNAttr();
3757 ReducedPartRdx =
3758 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3759 // If the reduction can be performed in a smaller type, we need to extend
3760 // the reduction to the wider type before we branch to the original loop.
3761 if (Phi->getType() != RdxDesc.getRecurrenceType())
3762 ReducedPartRdx =
3763 RdxDesc.isSigned()
3764 ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3765 : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3768 // Create a phi node that merges control-flow from the backedge-taken check
3769 // block and the middle block.
3770 PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3771 LoopScalarPreHeader->getTerminator());
3772 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3773 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3774 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3776 // Now, we need to fix the users of the reduction variable
3777 // inside and outside of the scalar remainder loop.
3778 // We know that the loop is in LCSSA form. We need to update the
3779 // PHI nodes in the exit blocks.
3780 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3781 // All PHINodes need to have a single entry edge, or two if
3782 // we already fixed them.
3783 assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3785 // We found a reduction value exit-PHI. Update it with the
3786 // incoming bypass edge.
3787 if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3788 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3789 } // end of the LCSSA phi scan.
3791 // Fix the scalar loop reduction variable with the incoming reduction sum
3792 // from the vector body and from the backedge value.
3793 int IncomingEdgeBlockIdx =
3794 Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3795 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3796 // Pick the other block.
3797 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3798 Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3799 Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3802 void InnerLoopVectorizer::fixLCSSAPHIs() {
3803 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3804 if (LCSSAPhi.getNumIncomingValues() == 1) {
3805 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3806 // Non-instruction incoming values will have only one value.
3807 unsigned LastLane = 0;
3808 if (isa<Instruction>(IncomingValue))
3809 LastLane = Cost->isUniformAfterVectorization(
3810 cast<Instruction>(IncomingValue), VF)
3811 ? 0
3812 : VF - 1;
3813 // Can be a loop invariant incoming value or the last scalar value to be
3814 // extracted from the vectorized loop.
3815 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3816 Value *lastIncomingValue =
3817 getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3818 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3823 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3824 // The basic block and loop containing the predicated instruction.
3825 auto *PredBB = PredInst->getParent();
3826 auto *VectorLoop = LI->getLoopFor(PredBB);
3828 // Initialize a worklist with the operands of the predicated instruction.
3829 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3831 // Holds instructions that we need to analyze again. An instruction may be
3832 // reanalyzed if we don't yet know if we can sink it or not.
3833 SmallVector<Instruction *, 8> InstsToReanalyze;
3835 // Returns true if a given use occurs in the predicated block. Phi nodes use
3836 // their operands in their corresponding predecessor blocks.
3837 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3838 auto *I = cast<Instruction>(U.getUser());
3839 BasicBlock *BB = I->getParent();
3840 if (auto *Phi = dyn_cast<PHINode>(I))
3841 BB = Phi->getIncomingBlock(
3842 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3843 return BB == PredBB;
3846 // Iteratively sink the scalarized operands of the predicated instruction
3847 // into the block we created for it. When an instruction is sunk, its
3848 // operands are then added to the worklist. The algorithm ends after one pass
3849 // through the worklist doesn't sink a single instruction.
3850 bool Changed;
3851 do {
3852 // Add the instructions that need to be reanalyzed to the worklist, and
3853 // reset the changed indicator.
3854 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3855 InstsToReanalyze.clear();
3856 Changed = false;
3858 while (!Worklist.empty()) {
3859 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3861 // We can't sink an instruction if it is a phi node, is already in the
3862 // predicated block, is not in the loop, or may have side effects.
3863 if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3864 !VectorLoop->contains(I) || I->mayHaveSideEffects())
3865 continue;
3867 // It's legal to sink the instruction if all its uses occur in the
3868 // predicated block. Otherwise, there's nothing to do yet, and we may
3869 // need to reanalyze the instruction.
3870 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3871 InstsToReanalyze.push_back(I);
3872 continue;
3875 // Move the instruction to the beginning of the predicated block, and add
3876 // its operands to the worklist.
3877 I->moveBefore(&*PredBB->getFirstInsertionPt());
3878 Worklist.insert(I->op_begin(), I->op_end());
3880 // The sinking may have enabled other instructions to be sunk, so we will
3881 // need to iterate.
3882 Changed = true;
3884 } while (Changed);
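// [Editorial note] Illustrative scenario for the sinking above: a scalarized,
// predicated udiv requires extractelement instructions for its operands. Once
// the udiv is their only user, those extracts are moved into the predicated
// block as well, so they execute only when the predicate is true.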
3887 void InnerLoopVectorizer::fixNonInductionPHIs() {
3888 for (PHINode *OrigPhi : OrigPHIsToFix) {
3889 PHINode *NewPhi =
3890 cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3891 unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3893 SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3894 predecessors(OrigPhi->getParent()));
3895 SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3896 predecessors(NewPhi->getParent()));
3897 assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3898 "Scalar and Vector BB should have the same number of predecessors");
3900 // The insertion point in Builder may be invalidated by the time we get
3901 // here. Force the Builder insertion point to something valid so that we do
3902 // not run into issues during insertion point restore in
3903 // getOrCreateVectorValue calls below.
3904 Builder.SetInsertPoint(NewPhi);
3906 // The predecessor order is preserved and we can rely on mapping between
3907 // scalar and vector block predecessors.
3908 for (unsigned i = 0; i < NumIncomingValues; ++i) {
3909 BasicBlock *NewPredBB = VectorBBPredecessors[i];
3911 // When looking up the new scalar/vector values to fix up, use incoming
3912 // values from original phi.
3913 Value *ScIncV =
3914 OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3916 // Scalar incoming value may need a broadcast
3917 Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3918 NewPhi->addIncoming(NewIncV, NewPredBB);
3923 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3924 unsigned VF) {
3925 PHINode *P = cast<PHINode>(PN);
3926 if (EnableVPlanNativePath) {
3927 // Currently we enter here in the VPlan-native path for non-induction
3928 // PHIs where all control flow is uniform. We simply widen these PHIs.
3929 // Create a vector phi with no operands - the vector phi operands will be
3930 // set at the end of vector code generation.
3931 Type *VecTy =
3932 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3933 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3934 VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3935 OrigPHIsToFix.push_back(P);
3937 return;
3940 assert(PN->getParent() == OrigLoop->getHeader() &&
3941 "Non-header phis should have been handled elsewhere");
3943 // In order to support recurrences we need to be able to vectorize Phi nodes.
3944 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3945 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3946 // this value when we vectorize all of the instructions that use the PHI.
3947 if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3948 for (unsigned Part = 0; Part < UF; ++Part) {
3949 // This is phase one of vectorizing PHIs.
3950 Type *VecTy =
3951 (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3952 Value *EntryPart = PHINode::Create(
3953 VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3954 VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3956 return;
3959 setDebugLocFromInst(Builder, P);
3961 // This PHINode must be an induction variable.
3962 // Make sure that we know about it.
3963 assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3965 InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3966 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3968 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3969 // which can be found from the original scalar operations.
3970 switch (II.getKind()) {
3971 case InductionDescriptor::IK_NoInduction:
3972 llvm_unreachable("Unknown induction");
3973 case InductionDescriptor::IK_IntInduction:
3974 case InductionDescriptor::IK_FpInduction:
3975 llvm_unreachable("Integer/fp induction is handled elsewhere.");
3976 case InductionDescriptor::IK_PtrInduction: {
3977 // Handle the pointer induction variable case.
3978 assert(P->getType()->isPointerTy() && "Unexpected type.");
3979 // This is the normalized GEP that starts counting at zero.
3980 Value *PtrInd = Induction;
3981 PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3982 // Determine the number of scalars we need to generate for each unroll
3983 // iteration. If the instruction is uniform, we only need to generate the
3984 // first lane. Otherwise, we generate all VF values.
3985 unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3986 // These are the scalar results. Notice that we don't generate vector GEPs
3987 // because scalar GEPs result in better code.
3988 for (unsigned Part = 0; Part < UF; ++Part) {
3989 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3990 Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3991 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3992 Value *SclrGep =
3993 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3994 SclrGep->setName("next.gep");
3995 VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3998 return;
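// [Editorial note] Illustrative output of the pointer-induction case above for
// VF = 4, UF = 2 (values are hypothetical): for unroll part P and lane L,
//   next.gep = gep StartValue, ((PtrInd + P * 4 + L) * Step)
// giving eight scalar GEPs per vector iteration, or only the lane-0 GEP per
// part when the pointer is uniform after vectorization.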
4003 /// A helper function for checking whether an integer division-related
4004 /// instruction may divide by zero (in which case it must be predicated if
4005 /// executed conditionally in the scalar code).
4006 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4007 /// Non-zero divisors that are not compile-time constants will not be
4008 /// converted into multiplication, so we will still end up scalarizing
4009 /// the division, but can do so w/o predication.
4010 static bool mayDivideByZero(Instruction &I) {
4011 assert((I.getOpcode() == Instruction::UDiv ||
4012 I.getOpcode() == Instruction::SDiv ||
4013 I.getOpcode() == Instruction::URem ||
4014 I.getOpcode() == Instruction::SRem) &&
4015 "Unexpected instruction");
4016 Value *Divisor = I.getOperand(1);
4017 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4018 return !CInt || CInt->isZero();
4021 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4022 switch (I.getOpcode()) {
4023 case Instruction::Br:
4024 case Instruction::PHI:
4025 llvm_unreachable("This instruction is handled by a different recipe.");
4026 case Instruction::GetElementPtr: {
4027 // Construct a vector GEP by widening the operands of the scalar GEP as
4028 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4029 // results in a vector of pointers when at least one operand of the GEP
4030 // is vector-typed. Thus, to keep the representation compact, we only use
4031 // vector-typed operands for loop-varying values.
4032 auto *GEP = cast<GetElementPtrInst>(&I);
4034 if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4035 // If we are vectorizing, but the GEP has only loop-invariant operands,
4036 // the GEP we build (by only using vector-typed operands for
4037 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4038 // produce a vector of pointers, we need to either arbitrarily pick an
4039 // operand to broadcast, or broadcast a clone of the original GEP.
4040 // Here, we broadcast a clone of the original.
4042 // TODO: If at some point we decide to scalarize instructions having
4043 // loop-invariant operands, this special case will no longer be
4044 // required. We would add the scalarization decision to
4045 // collectLoopScalars() and teach getVectorValue() to broadcast
4046 // the lane-zero scalar value.
4047 auto *Clone = Builder.Insert(GEP->clone());
4048 for (unsigned Part = 0; Part < UF; ++Part) {
4049 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4050 VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4051 addMetadata(EntryPart, GEP);
4053 } else {
4054 // If the GEP has at least one loop-varying operand, we are sure to
4055 // produce a vector of pointers. But if we are only unrolling, we want
4056 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4057 // produce with the code below will be scalar (if VF == 1) or vector
4058 // (otherwise). Note that for the unroll-only case, we still maintain
4059 // values in the vector mapping with initVector, as we do for other
4060 // instructions.
4061 for (unsigned Part = 0; Part < UF; ++Part) {
4062 // The pointer operand of the new GEP. If it's loop-invariant, we
4063 // won't broadcast it.
4064 auto *Ptr =
4065 OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4066 ? GEP->getPointerOperand()
4067 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4069 // Collect all the indices for the new GEP. If any index is
4070 // loop-invariant, we won't broadcast it.
4071 SmallVector<Value *, 4> Indices;
4072 for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4073 if (OrigLoop->isLoopInvariant(U.get()))
4074 Indices.push_back(U.get());
4075 else
4076 Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4079 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4080 // but it should be a vector, otherwise.
4081 auto *NewGEP =
4082 GEP->isInBounds()
4083 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4084 Indices)
4085 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4086 assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4087 "NewGEP is not a pointer vector");
4088 VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4089 addMetadata(NewGEP, GEP);
4093 break;
4095 case Instruction::UDiv:
4096 case Instruction::SDiv:
4097 case Instruction::SRem:
4098 case Instruction::URem:
4099 case Instruction::Add:
4100 case Instruction::FAdd:
4101 case Instruction::Sub:
4102 case Instruction::FSub:
4103 case Instruction::FNeg:
4104 case Instruction::Mul:
4105 case Instruction::FMul:
4106 case Instruction::FDiv:
4107 case Instruction::FRem:
4108 case Instruction::Shl:
4109 case Instruction::LShr:
4110 case Instruction::AShr:
4111 case Instruction::And:
4112 case Instruction::Or:
4113 case Instruction::Xor: {
4114 // Just widen unops and binops.
4115 setDebugLocFromInst(Builder, &I);
4117 for (unsigned Part = 0; Part < UF; ++Part) {
4118 SmallVector<Value *, 2> Ops;
4119 for (Value *Op : I.operands())
4120 Ops.push_back(getOrCreateVectorValue(Op, Part));
4122 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4124 if (auto *VecOp = dyn_cast<Instruction>(V))
4125 VecOp->copyIRFlags(&I);
4127 // Use this vector value for all users of the original instruction.
4128 VectorLoopValueMap.setVectorValue(&I, Part, V);
4129 addMetadata(V, &I);
4132 break;
4134 case Instruction::Select: {
4135 // Widen selects.
4136 // If the selector is loop invariant we can create a select
4137 // instruction with a scalar condition. Otherwise, use vector-select.
4138 auto *SE = PSE.getSE();
4139 bool InvariantCond =
4140 SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4141 setDebugLocFromInst(Builder, &I);
4143 // The condition can be loop invariant but still defined inside the
4144 // loop. This means that we can't just use the original 'cond' value.
4145 // We have to take the 'vectorized' value and pick the first lane.
4146 // Instcombine will make this a no-op.
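// For example (illustrative IR), for "select i1 %c, i32 %a, i32 %b" with a
// loop-invariant %c, the widened select keeps the scalar lane-zero condition
// and only %a and %b are replaced by their <VF x i32> vector values.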
4148 auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4150 for (unsigned Part = 0; Part < UF; ++Part) {
4151 Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4152 Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4153 Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4154 Value *Sel =
4155 Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4156 VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4157 addMetadata(Sel, &I);
4160 break;
4163 case Instruction::ICmp:
4164 case Instruction::FCmp: {
4165 // Widen compares. Generate vector compares.
4166 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4167 auto *Cmp = dyn_cast<CmpInst>(&I);
4168 setDebugLocFromInst(Builder, Cmp);
4169 for (unsigned Part = 0; Part < UF; ++Part) {
4170 Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4171 Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4172 Value *C = nullptr;
4173 if (FCmp) {
4174 // Propagate fast math flags.
4175 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4176 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4177 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4178 } else {
4179 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4181 VectorLoopValueMap.setVectorValue(&I, Part, C);
4182 addMetadata(C, &I);
4185 break;
4188 case Instruction::ZExt:
4189 case Instruction::SExt:
4190 case Instruction::FPToUI:
4191 case Instruction::FPToSI:
4192 case Instruction::FPExt:
4193 case Instruction::PtrToInt:
4194 case Instruction::IntToPtr:
4195 case Instruction::SIToFP:
4196 case Instruction::UIToFP:
4197 case Instruction::Trunc:
4198 case Instruction::FPTrunc:
4199 case Instruction::BitCast: {
4200 auto *CI = dyn_cast<CastInst>(&I);
4201 setDebugLocFromInst(Builder, CI);
4203 /// Vectorize casts.
4204 Type *DestTy =
4205 (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4207 for (unsigned Part = 0; Part < UF; ++Part) {
4208 Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4209 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4210 VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4211 addMetadata(Cast, &I);
4213 break;
4216 case Instruction::Call: {
4217 // Ignore dbg intrinsics.
4218 if (isa<DbgInfoIntrinsic>(I))
4219 break;
4220 setDebugLocFromInst(Builder, &I);
4222 Module *M = I.getParent()->getParent()->getParent();
4223 auto *CI = cast<CallInst>(&I);
4225 StringRef FnName = CI->getCalledFunction()->getName();
4226 Function *F = CI->getCalledFunction();
4227 Type *RetTy = ToVectorTy(CI->getType(), VF);
4228 SmallVector<Type *, 4> Tys;
4229 for (Value *ArgOperand : CI->arg_operands())
4230 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4232 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4234 // This flag shows whether we should use an intrinsic or an ordinary call for
4235 // the vectorized version of the instruction, i.e., whether it is beneficial
4236 // to call the vector intrinsic rather than the vector library function.
4237 bool NeedToScalarize;
4238 unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4239 bool UseVectorIntrinsic =
4240 ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4241 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4242 "Instruction should be scalarized elsewhere.");
4244 for (unsigned Part = 0; Part < UF; ++Part) {
4245 SmallVector<Value *, 4> Args;
4246 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4247 Value *Arg = CI->getArgOperand(i);
4248 // Some intrinsics have a scalar argument - don't replace it with a
4249 // vector.
4250 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4251 Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4252 Args.push_back(Arg);
4255 Function *VectorF;
4256 if (UseVectorIntrinsic) {
4257 // Use vector version of the intrinsic.
4258 Type *TysForDecl[] = {CI->getType()};
4259 if (VF > 1)
4260 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4261 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4262 } else {
4263 // Use vector version of the library call.
4264 StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4265 assert(!VFnName.empty() && "Vector function name is empty.");
4266 VectorF = M->getFunction(VFnName);
4267 if (!VectorF) {
4268 // Generate a declaration
4269 FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4270 VectorF =
4271 Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4272 VectorF->copyAttributesFrom(F);
4275 assert(VectorF && "Can't create vector function.");
4277 SmallVector<OperandBundleDef, 1> OpBundles;
4278 CI->getOperandBundlesAsDefs(OpBundles);
4279 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4281 if (isa<FPMathOperator>(V))
4282 V->copyFastMathFlags(CI);
4284 VectorLoopValueMap.setVectorValue(&I, Part, V);
4285 addMetadata(V, &I);
4288 break;
4291 default:
4292 // This instruction is not vectorized by simple widening.
4293 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4294 llvm_unreachable("Unhandled instruction!");
4295 } // end of switch.
4298 void InnerLoopVectorizer::updateAnalysis() {
4299 // Forget the original basic block.
4300 PSE.getSE()->forgetLoop(OrigLoop);
4302 // DT is not kept up-to-date for outer loop vectorization
4303 if (EnableVPlanNativePath)
4304 return;
4306 // Update the dominator tree information.
4307 assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4308 "Entry does not dominate exit.");
4310 DT->addNewBlock(LoopMiddleBlock,
4311 LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4312 DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4313 DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4314 DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4315 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4318 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4319 // We should not collect Scalars more than once per VF. Right now, this
4320 // function is called from collectUniformsAndScalars(), which already does
4321 // this check. Collecting Scalars for VF=1 does not make any sense.
4322 assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4323 "This function should not be visited twice for the same VF");
4325 SmallSetVector<Instruction *, 8> Worklist;
4327 // These sets are used to seed the analysis with pointers used by memory
4328 // accesses that will remain scalar.
4329 SmallSetVector<Instruction *, 8> ScalarPtrs;
4330 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4332 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4333 // The pointer operands of loads and stores will be scalar as long as the
4334 // memory access is not a gather or scatter operation. The value operand of a
4335 // store will remain scalar if the store is scalarized.
4336 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4337 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4338 assert(WideningDecision != CM_Unknown &&
4339 "Widening decision should be ready at this moment");
4340 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4341 if (Ptr == Store->getValueOperand())
4342 return WideningDecision == CM_Scalarize;
4343 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4344 "Ptr is neither a value or pointer operand");
4345 return WideningDecision != CM_GatherScatter;
4348 // A helper that returns true if the given value is a bitcast or
4349 // getelementptr instruction contained in the loop.
4350 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4351 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4352 isa<GetElementPtrInst>(V)) &&
4353 !TheLoop->isLoopInvariant(V);
4356 // A helper that evaluates a memory access's use of a pointer. If the use
4357 // will be a scalar use, and the pointer is only used by memory accesses, we
4358 // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4359 // PossibleNonScalarPtrs.
4360 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4361 // We only care about bitcast and getelementptr instructions contained in
4362 // the loop.
4363 if (!isLoopVaryingBitCastOrGEP(Ptr))
4364 return;
4366 // If the pointer has already been identified as scalar (e.g., if it was
4367 // also identified as uniform), there's nothing to do.
4368 auto *I = cast<Instruction>(Ptr);
4369 if (Worklist.count(I))
4370 return;
4372 // If the use of the pointer will be a scalar use, and all users of the
4373 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4374 // place the pointer in PossibleNonScalarPtrs.
4375 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4376 return isa<LoadInst>(U) || isa<StoreInst>(U);
4378 ScalarPtrs.insert(I);
4379 else
4380 PossibleNonScalarPtrs.insert(I);
4383 // We seed the scalars analysis with three classes of instructions: (1)
4384 // instructions marked uniform-after-vectorization, (2) bitcast and
4385 // getelementptr instructions used by memory accesses requiring a scalar use,
4386 // and (3) pointer induction variables and their update instructions (we
4387 // currently only scalarize these).
4389 // (1) Add to the worklist all instructions that have been identified as
4390 // uniform-after-vectorization.
4391 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4393 // (2) Add to the worklist all bitcast and getelementptr instructions used by
4394 // memory accesses requiring a scalar use. The pointer operands of loads and
4395 // stores will be scalar as long as the memory access is not a gather or
4396 // scatter operation. The value operand of a store will remain scalar if the
4397 // store is scalarized.
4398 for (auto *BB : TheLoop->blocks())
4399 for (auto &I : *BB) {
4400 if (auto *Load = dyn_cast<LoadInst>(&I)) {
4401 evaluatePtrUse(Load, Load->getPointerOperand());
4402 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4403 evaluatePtrUse(Store, Store->getPointerOperand());
4404 evaluatePtrUse(Store, Store->getValueOperand());
4407 for (auto *I : ScalarPtrs)
4408 if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4409 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4410 Worklist.insert(I);
4413 // (3) Add to the worklist all pointer induction variables and their update
4414 // instructions.
4416 // TODO: Once we are able to vectorize pointer induction variables we should
4417 // no longer insert them into the worklist here.
4418 auto *Latch = TheLoop->getLoopLatch();
4419 for (auto &Induction : *Legal->getInductionVars()) {
4420 auto *Ind = Induction.first;
4421 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4422 if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4423 continue;
4424 Worklist.insert(Ind);
4425 Worklist.insert(IndUpdate);
4426 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4427 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4428 << "\n");
4431 // Insert the forced scalars.
4432 // FIXME: Currently widenPHIInstruction() often creates a dead vector
4433 // induction variable when the PHI user is scalarized.
4434 auto ForcedScalar = ForcedScalars.find(VF);
4435 if (ForcedScalar != ForcedScalars.end())
4436 for (auto *I : ForcedScalar->second)
4437 Worklist.insert(I);
4439 // Expand the worklist by looking through any bitcasts and getelementptr
4440 // instructions we've already identified as scalar. This is similar to the
4441 // expansion step in collectLoopUniforms(); however, here we're only
4442 // expanding to include additional bitcasts and getelementptr instructions.
4443 unsigned Idx = 0;
4444 while (Idx != Worklist.size()) {
4445 Instruction *Dst = Worklist[Idx++];
4446 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4447 continue;
4448 auto *Src = cast<Instruction>(Dst->getOperand(0));
4449 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4450 auto *J = cast<Instruction>(U);
4451 return !TheLoop->contains(J) || Worklist.count(J) ||
4452 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4453 isScalarUse(J, Src));
4454 })) {
4455 Worklist.insert(Src);
4456 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4460 // An induction variable will remain scalar if all users of the induction
4461 // variable and induction variable update remain scalar.
4462 for (auto &Induction : *Legal->getInductionVars()) {
4463 auto *Ind = Induction.first;
4464 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4466 // We already considered pointer induction variables, so there's no reason
4467 // to look at their users again.
4469 // TODO: Once we are able to vectorize pointer induction variables we
4470 // should no longer skip over them here.
4471 if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4472 continue;
4474 // Determine if all users of the induction variable are scalar after
4475 // vectorization.
4476 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4477 auto *I = cast<Instruction>(U);
4478 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4480 if (!ScalarInd)
4481 continue;
4483 // Determine if all users of the induction variable update instruction are
4484 // scalar after vectorization.
4485 auto ScalarIndUpdate =
4486 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4487 auto *I = cast<Instruction>(U);
4488 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4490 if (!ScalarIndUpdate)
4491 continue;
4493 // The induction variable and its update instruction will remain scalar.
4494 Worklist.insert(Ind);
4495 Worklist.insert(IndUpdate);
4496 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4497 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4498 << "\n");
4501 Scalars[VF].insert(Worklist.begin(), Worklist.end());
4504 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4505 if (!blockNeedsPredication(I->getParent()))
4506 return false;
4507 switch(I->getOpcode()) {
4508 default:
4509 break;
4510 case Instruction::Load:
4511 case Instruction::Store: {
4512 if (!Legal->isMaskRequired(I))
4513 return false;
4514 auto *Ptr = getLoadStorePointerOperand(I);
4515 auto *Ty = getMemInstValueType(I);
4516 // We have already decided how to vectorize this instruction, get that
4517 // result.
4518 if (VF > 1) {
4519 InstWidening WideningDecision = getWideningDecision(I, VF);
4520 assert(WideningDecision != CM_Unknown &&
4521 "Widening decision should be ready at this moment");
4522 return WideningDecision == CM_Scalarize;
4524 return isa<LoadInst>(I) ?
4525 !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
4526 : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4528 case Instruction::UDiv:
4529 case Instruction::SDiv:
4530 case Instruction::SRem:
4531 case Instruction::URem:
4532 return mayDivideByZero(*I);
4534 return false;
4537 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4538 unsigned VF) {
4539 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4540 assert(getWideningDecision(I, VF) == CM_Unknown &&
4541 "Decision should not be set yet.");
4542 auto *Group = getInterleavedAccessGroup(I);
4543 assert(Group && "Must have a group.");
4545 // If the instruction's allocated size doesn't equal its type size, it
4546 // requires padding and will be scalarized.
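// For example, a type such as i120 is stored with padding up to 128 bits, so
// VF consecutive scalar accesses do not match a packed <VF x i120> vector and
// the group must be scalarized instead.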
4547 auto &DL = I->getModule()->getDataLayout();
4548 auto *ScalarTy = getMemInstValueType(I);
4549 if (hasIrregularType(ScalarTy, DL, VF))
4550 return false;
4552 // Check if masking is required.
4553 // A Group may need masking for one of two reasons: it resides in a block that
4554 // needs predication, or it was decided to use masking to deal with gaps.
4555 bool PredicatedAccessRequiresMasking =
4556 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4557 bool AccessWithGapsRequiresMasking =
4558 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4559 if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4560 return true;
4562 // If masked interleaving is required, we expect that the user/target had
4563 // enabled it, because otherwise it either wouldn't have been created or
4564 // it should have been invalidated by the CostModel.
4565 assert(useMaskedInterleavedAccesses(TTI) &&
4566 "Masked interleave-groups for predicated accesses are not enabled.");
4568 auto *Ty = getMemInstValueType(I);
4569 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4570 : TTI.isLegalMaskedStore(Ty);
4573 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4574 unsigned VF) {
4575 // Get and ensure we have a valid memory instruction.
4576 LoadInst *LI = dyn_cast<LoadInst>(I);
4577 StoreInst *SI = dyn_cast<StoreInst>(I);
4578 assert((LI || SI) && "Invalid memory instruction");
4580 auto *Ptr = getLoadStorePointerOperand(I);
4582 // In order to be widened, the pointer should be consecutive, first of all.
4583 if (!Legal->isConsecutivePtr(Ptr))
4584 return false;
4586 // If the instruction is a store located in a predicated block, it will be
4587 // scalarized.
4588 if (isScalarWithPredication(I))
4589 return false;
4591 // If the instruction's allocated size doesn't equal its type size, it
4592 // requires padding and will be scalarized.
4593 auto &DL = I->getModule()->getDataLayout();
4594 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4595 if (hasIrregularType(ScalarTy, DL, VF))
4596 return false;
4598 return true;
4601 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4602 // We should not collect Uniforms more than once per VF. Right now,
4603 // this function is called from collectUniformsAndScalars(), which
4604 // already does this check. Collecting Uniforms for VF=1 does not make any
4605 // sense.
4607 assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4608 "This function should not be visited twice for the same VF");
4610 // Visit the list of Uniforms. Even if we do not find any uniform values, we
4611 // won't analyze the loop again: Uniforms.count(VF) will still return 1.
4612 Uniforms[VF].clear();
4614 // We now know that the loop is vectorizable!
4615 // Collect instructions inside the loop that will remain uniform after
4616 // vectorization.
4618 // Global values, params and instructions outside of current loop are out of
4619 // scope.
4620 auto isOutOfScope = [&](Value *V) -> bool {
4621 Instruction *I = dyn_cast<Instruction>(V);
4622 return (!I || !TheLoop->contains(I));
4625 SetVector<Instruction *> Worklist;
4626 BasicBlock *Latch = TheLoop->getLoopLatch();
4628 // Start with the conditional branch. If the branch condition is an
4629 // instruction contained in the loop that is only used by the branch, it is
4630 // uniform.
4631 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4632 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4633 Worklist.insert(Cmp);
4634 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4637 // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4638 // are pointers that are treated like consecutive pointers during
4639 // vectorization. The pointer operands of interleaved accesses are an
4640 // example.
4641 SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4643 // Holds pointer operands of instructions that are possibly non-uniform.
4644 SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4646 auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4647 InstWidening WideningDecision = getWideningDecision(I, VF);
4648 assert(WideningDecision != CM_Unknown &&
4649 "Widening decision should be ready at this moment");
4651 return (WideningDecision == CM_Widen ||
4652 WideningDecision == CM_Widen_Reverse ||
4653 WideningDecision == CM_Interleave);
4655 // Iterate over the instructions in the loop, and collect all
4656 // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4657 // that a consecutive-like pointer operand will be scalarized, we collect it
4658 // in PossibleNonUniformPtrs instead. We use two sets here because a single
4659 // getelementptr instruction can be used by both vectorized and scalarized
4660 // memory instructions. For example, if a loop loads and stores from the same
4661 // location, but the store is conditional, the store will be scalarized, and
4662 // the getelementptr won't remain uniform.
4663 for (auto *BB : TheLoop->blocks())
4664 for (auto &I : *BB) {
4665 // If there's no pointer operand, there's nothing to do.
4666 auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4667 if (!Ptr)
4668 continue;
4670 // True if all users of Ptr are memory accesses that have Ptr as their
4671 // pointer operand.
4672 auto UsersAreMemAccesses =
4673 llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4674 return getLoadStorePointerOperand(U) == Ptr;
4677 // Ensure the memory instruction will not be scalarized or used by
4678 // gather/scatter, making its pointer operand non-uniform. If the pointer
4679 // operand is used by any instruction other than a memory access, we
4680 // conservatively assume the pointer operand may be non-uniform.
4681 if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4682 PossibleNonUniformPtrs.insert(Ptr);
4684 // If the memory instruction will be vectorized and its pointer operand
4685 // is consecutive-like, or interleaving - the pointer operand should
4686 // remain uniform.
4687 else
4688 ConsecutiveLikePtrs.insert(Ptr);
4691 // Add to the Worklist all consecutive and consecutive-like pointers that
4692 // aren't also identified as possibly non-uniform.
4693 for (auto *V : ConsecutiveLikePtrs)
4694 if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4695 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4696 Worklist.insert(V);
4699 // Expand Worklist in topological order: whenever a new instruction
4700 // is added, its users should already be inside Worklist. This ensures that
4701 // a uniform instruction will only be used by uniform instructions.
4702 unsigned idx = 0;
4703 while (idx != Worklist.size()) {
4704 Instruction *I = Worklist[idx++];
4706 for (auto OV : I->operand_values()) {
4707 // isOutOfScope operands cannot be uniform instructions.
4708 if (isOutOfScope(OV))
4709 continue;
4710 // First order recurrence Phi's should typically be considered
4711 // non-uniform.
4712 auto *OP = dyn_cast<PHINode>(OV);
4713 if (OP && Legal->isFirstOrderRecurrence(OP))
4714 continue;
4715 // If all the users of the operand are uniform, then add the
4716 // operand into the uniform worklist.
4717 auto *OI = cast<Instruction>(OV);
4718 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4719 auto *J = cast<Instruction>(U);
4720 return Worklist.count(J) ||
4721 (OI == getLoadStorePointerOperand(J) &&
4722 isUniformDecision(J, VF));
4723 })) {
4724 Worklist.insert(OI);
4725 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4730 // Returns true if Ptr is the pointer operand of a memory access instruction
4731 // I, and I is known to not require scalarization.
4732 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4733 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4736 // For an instruction to be added into Worklist above, all its users inside
4737 // the loop should also be in Worklist. However, this condition cannot be
4738 // true for phi nodes that form a cyclic dependence. We must process phi
4739 // nodes separately. An induction variable will remain uniform if all users
4740 // of the induction variable and induction variable update remain uniform.
4741 // The code below handles both pointer and non-pointer induction variables.
4742 for (auto &Induction : *Legal->getInductionVars()) {
4743 auto *Ind = Induction.first;
4744 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4746 // Determine if all users of the induction variable are uniform after
4747 // vectorization.
4748 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4749 auto *I = cast<Instruction>(U);
4750 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4751 isVectorizedMemAccessUse(I, Ind);
4753 if (!UniformInd)
4754 continue;
4756 // Determine if all users of the induction variable update instruction are
4757 // uniform after vectorization.
4758 auto UniformIndUpdate =
4759 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4760 auto *I = cast<Instruction>(U);
4761 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4762 isVectorizedMemAccessUse(I, IndUpdate);
4764 if (!UniformIndUpdate)
4765 continue;
4767 // The induction variable and its update instruction will remain uniform.
4768 Worklist.insert(Ind);
4769 Worklist.insert(IndUpdate);
4770 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4771 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4772 << "\n");
4775 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4778 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4779 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4781 if (Legal->getRuntimePointerChecking()->Need) {
4782 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4783 "runtime pointer checks needed. Enable vectorization of this "
4784 "loop with '#pragma clang loop vectorize(enable)' when "
4785 "compiling with -Os/-Oz",
4786 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4787 return true;
4790 if (!PSE.getUnionPredicate().getPredicates().empty()) {
4791 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4792 "runtime SCEV checks needed. Enable vectorization of this "
4793 "loop with '#pragma clang loop vectorize(enable)' when "
4794 "compiling with -Os/-Oz",
4795 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4796 return true;
4799 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4800 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4801 reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4802 "runtime stride == 1 checks needed. Enable vectorization of "
4803 "this loop with '#pragma clang loop vectorize(enable)' when "
4804 "compiling with -Os/-Oz",
4805 "CantVersionLoopWithOptForSize", ORE, TheLoop);
4806 return true;
4809 return false;
4812 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4813 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4814 // TODO: It may be useful to do this, since the check is still likely to be
4815 // dynamically uniform if the target can skip it.
4816 reportVectorizationFailure(
4817 "Not inserting runtime ptr check for divergent target",
4818 "runtime pointer checks needed. Not enabled for divergent target",
4819 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4820 return None;
4823 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4824 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4825 if (TC == 1) {
4826 reportVectorizationFailure("Single iteration (non) loop",
4827 "loop trip count is one, irrelevant for vectorization",
4828 "SingleIterationLoop", ORE, TheLoop);
4829 return None;
4832 switch (ScalarEpilogueStatus) {
4833 case CM_ScalarEpilogueAllowed:
4834 return computeFeasibleMaxVF(TC);
4835 case CM_ScalarEpilogueNotNeededUsePredicate:
4836 LLVM_DEBUG(
4837 dbgs() << "LV: vector predicate hint/switch found.\n"
4838 << "LV: Not allowing scalar epilogue, creating predicated "
4839 << "vector loop.\n");
4840 break;
4841 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4842 // fallthrough as a special case of OptForSize
4843 case CM_ScalarEpilogueNotAllowedOptSize:
4844 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4845 LLVM_DEBUG(
4846 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4847 else
4848 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4849 << "count.\n");
4851 // Bail if runtime checks are required, which are not good when optimising
4852 // for size.
4853 if (runtimeChecksRequired())
4854 return None;
4855 break;
4858 // Now try the tail folding
4860 // Invalidate interleave groups that require an epilogue if we can't mask
4861 // the interleave-group.
4862 if (!useMaskedInterleavedAccesses(TTI))
4863 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4865 unsigned MaxVF = computeFeasibleMaxVF(TC);
4866 if (TC > 0 && TC % MaxVF == 0) {
4867 // Accept MaxVF if we do not have a tail.
4868 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4869 return MaxVF;
4872 // If we don't know the precise trip count, or if the trip count that we
4873 // found modulo the vectorization factor is not zero, try to fold the tail
4874 // by masking.
4875 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
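// For example (illustrative trip count), TC = 100 with MaxVF = 8 would leave
// a tail of 4 iterations; folding the tail by masking instead executes 13
// vector iterations, with the unused lanes of the last iteration masked off.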
4876 if (Legal->prepareToFoldTailByMasking()) {
4877 FoldTailByMasking = true;
4878 return MaxVF;
4881 if (TC == 0) {
4882 reportVectorizationFailure(
4883 "Unable to calculate the loop count due to complex control flow",
4884 "unable to calculate the loop count due to complex control flow",
4885 "UnknownLoopCountComplexCFG", ORE, TheLoop);
4886 return None;
4889 reportVectorizationFailure(
4890 "Cannot optimize for size and vectorize at the same time.",
4891 "cannot optimize for size and vectorize at the same time. "
4892 "Enable vectorization of this loop with '#pragma clang loop "
4893 "vectorize(enable)' when compiling with -Os/-Oz",
4894 "NoTailLoopWithOptForSize", ORE, TheLoop);
4895 return None;
4898 unsigned
4899 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4900 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4901 unsigned SmallestType, WidestType;
4902 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4903 unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4905 // Get the maximum safe dependence distance in bits computed by LAA.
4906 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4907 // the memory access that is most restrictive (involved in the smallest
4908 // dependence distance).
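// For example (illustrative values), if the most restrictive dependence
// allows at most 4 iterations in flight on i32 accesses, then
// MaxSafeRegisterWidth = 4 * 4 * 8 = 128 bits, and WidestRegister is clamped
// to 128 below.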
4909 unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4911 WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4913 unsigned MaxVectorSize = WidestRegister / WidestType;
4915 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4916 << " / " << WidestType << " bits.\n");
4917 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4918 << WidestRegister << " bits.\n");
4920 assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4921 " into one vector!");
4922 if (MaxVectorSize == 0) {
4923 LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4924 MaxVectorSize = 1;
4925 return MaxVectorSize;
4926 } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4927 isPowerOf2_32(ConstTripCount)) {
4928 // We need to clamp the VF to be the ConstTripCount. There is no point in
4929 // choosing a higher viable VF as done in the loop below.
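// For example, a constant trip count of 4 with MaxVectorSize = 8 clamps the
// VF to 4, so the entire loop becomes a single vector iteration with no tail.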
4930 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4931 << ConstTripCount << "\n");
4932 MaxVectorSize = ConstTripCount;
4933 return MaxVectorSize;
4936 unsigned MaxVF = MaxVectorSize;
4937 if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4938 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4939 // Collect all viable vectorization factors larger than the default MaxVF
4940 // (i.e. MaxVectorSize).
4941 SmallVector<unsigned, 8> VFs;
4942 unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4943 for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4944 VFs.push_back(VS);
4946 // For each VF calculate its register usage.
4947 auto RUs = calculateRegisterUsage(VFs);
4949 // Select the largest VF which doesn't require more registers than existing
4950 // ones.
4951 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4952 for (int i = RUs.size() - 1; i >= 0; --i) {
4953 if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4954 MaxVF = VFs[i];
4955 break;
4958 if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4959 if (MaxVF < MinVF) {
4960 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4961 << ") with target's minimum: " << MinVF << '\n');
4962 MaxVF = MinVF;
4966 return MaxVF;
4969 VectorizationFactor
4970 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4971 float Cost = expectedCost(1).first;
4972 const float ScalarCost = Cost;
4973 unsigned Width = 1;
4974 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4976 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4977 if (ForceVectorization && MaxVF > 1) {
4978 // Ignore scalar width, because the user explicitly wants vectorization.
4979 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4980 // evaluation.
4981 Cost = std::numeric_limits<float>::max();
4984 for (unsigned i = 2; i <= MaxVF; i *= 2) {
4985 // Notice that the vector loop needs to be executed fewer times, so
4986 // we need to divide the cost of the vector loop by the width of
4987 // the vector elements.
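// For example (illustrative costs), if the scalar loop costs 8 and the VF = 4
// loop costs 20, the per-lane cost is 20 / 4 = 5, which beats the scalar cost
// and makes VF = 4 the best width seen so far.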
4988 VectorizationCostTy C = expectedCost(i);
4989 float VectorCost = C.first / (float)i;
4990 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4991 << " costs: " << (int)VectorCost << ".\n");
4992 if (!C.second && !ForceVectorization) {
4993 LLVM_DEBUG(
4994 dbgs() << "LV: Not considering vector loop of width " << i
4995 << " because it will not generate any vector instructions.\n");
4996 continue;
4998 if (VectorCost < Cost) {
4999 Cost = VectorCost;
5000 Width = i;
5004 if (!EnableCondStoresVectorization && NumPredStores) {
5005 reportVectorizationFailure("There are conditional stores.",
5006 "store that is conditionally executed prevents vectorization",
5007 "ConditionalStore", ORE, TheLoop);
5008 Width = 1;
5009 Cost = ScalarCost;
5012 LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5013 << "LV: Vectorization seems to be not beneficial, "
5014 << "but was forced by a user.\n");
5015 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5016 VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5017 return Factor;
5020 std::pair<unsigned, unsigned>
5021 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5022 unsigned MinWidth = -1U;
5023 unsigned MaxWidth = 8;
5024 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5026 // For each block.
5027 for (BasicBlock *BB : TheLoop->blocks()) {
5028 // For each instruction in the loop.
5029 for (Instruction &I : BB->instructionsWithoutDebug()) {
5030 Type *T = I.getType();
5032 // Skip ignored values.
5033 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5034 continue;
5036 // Only examine Loads, Stores and PHINodes.
5037 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5038 continue;
5040 // Examine PHI nodes that are reduction variables. Update the type to
5041 // account for the recurrence type.
5042 if (auto *PN = dyn_cast<PHINode>(&I)) {
5043 if (!Legal->isReductionVariable(PN))
5044 continue;
5045 RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5046 T = RdxDesc.getRecurrenceType();
5049 // Examine the stored values.
5050 if (auto *ST = dyn_cast<StoreInst>(&I))
5051 T = ST->getValueOperand()->getType();
5053 // Ignore loaded pointer types and stored pointer types that are not
5054 // vectorizable.
5056 // FIXME: The check here attempts to predict whether a load or store will
5057 // be vectorized. We only know this for certain after a VF has
5058 // been selected. Here, we assume that if an access can be
5059 // vectorized, it will be. We should also look at extending this
5060 // optimization to non-pointer types.
5062 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5063 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5064 continue;
5066 MinWidth = std::min(MinWidth,
5067 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5068 MaxWidth = std::max(MaxWidth,
5069 (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5073 return {MinWidth, MaxWidth};
5076 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5077 unsigned LoopCost) {
5078 // -- The interleave heuristics --
5079 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5080 // There are many micro-architectural considerations that we can't predict
5081 // at this level. For example, frontend pressure (on decode or fetch) due to
5082 // code size, or the number and capabilities of the execution ports.
5084 // We use the following heuristics to select the interleave count:
5085 // 1. If the code has reductions, then we interleave to break the cross
5086 // iteration dependency.
5087 // 2. If the loop is really small, then we interleave to reduce the loop
5088 // overhead.
5089 // 3. We don't interleave if we think that we will spill registers to memory
5090 // due to the increased register pressure.
5092 if (!isScalarEpilogueAllowed())
5093 return 1;
5095 // We already used the maximum safe dependence distance to limit the VF; do not interleave further.
5096 if (Legal->getMaxSafeDepDistBytes() != -1U)
5097 return 1;
5099 // Do not interleave loops with a relatively small trip count.
5100 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5101 if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5102 return 1;
5104 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5105 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5106 << " registers\n");
5108 if (VF == 1) {
5109 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5110 TargetNumRegisters = ForceTargetNumScalarRegs;
5111 } else {
5112 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5113 TargetNumRegisters = ForceTargetNumVectorRegs;
5116 RegisterUsage R = calculateRegisterUsage({VF})[0];
5117 // We divide by these values below, so assume that we have at least one
5118 // instruction that uses at least one register.
5119 R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5121 // We calculate the interleave count using the following formula.
5122 // Subtract the number of loop invariants from the number of available
5123 // registers. These registers are used by all of the interleaved instances.
5124 // Next, divide the remaining registers by the number of registers that is
5125 // required by the loop, in order to estimate how many parallel instances
5126 // fit without causing spills. All of this is rounded down if necessary to be
5127 // a power of two. We want power of two interleave count to simplify any
5128 // addressing operations or alignment considerations.
5129 // We also want power of two interleave counts to ensure that the induction
5130 // variable of the vector loop wraps to zero, when tail is folded by masking;
5131 // this currently happens when OptForSize, in which case IC is set to 1 above.
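// For example (illustrative numbers), with 16 available registers, 2 of them
// tied up by loop-invariant values, and at most 3 registers live inside the
// loop, IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.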
5132 unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5133 R.MaxLocalUsers);
5135 // Don't count the induction variable as interleaved.
5136 if (EnableIndVarRegisterHeur)
5137 IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5138 std::max(1U, (R.MaxLocalUsers - 1)));
5140 // Clamp the interleave ranges to reasonable counts.
5141 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5143 // Check if the user has overridden the max.
5144 if (VF == 1) {
5145 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5146 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5147 } else {
5148 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5149 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5152 // If the trip count is constant, limit the interleave count to be less than
5153 // the trip count divided by VF.
5154 if (TC > 0) {
5155 assert(TC >= VF && "VF exceeds trip count?");
5156 if ((TC / VF) < MaxInterleaveCount)
5157 MaxInterleaveCount = (TC / VF);
5160 // If we did not calculate the cost for VF (because the user selected the VF)
5161 // then we calculate the cost of VF here.
5162 if (LoopCost == 0)
5163 LoopCost = expectedCost(VF).first;
5165 assert(LoopCost && "Non-zero loop cost expected");
5167 // Clamp the calculated IC to be between 1 and the max interleave count
5168 // that the target and trip count allow.
5169 if (IC > MaxInterleaveCount)
5170 IC = MaxInterleaveCount;
5171 else if (IC < 1)
5172 IC = 1;
5174 // Interleave if we vectorized this loop and there is a reduction that could
5175 // benefit from interleaving.
5176 if (VF > 1 && !Legal->getReductionVars()->empty()) {
5177 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5178 return IC;
5181 // Note that if we've already vectorized the loop we will have done the
5182 // runtime check and so interleaving won't require further checks.
5183 bool InterleavingRequiresRuntimePointerCheck =
5184 (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5186 // We want to interleave small loops in order to reduce the loop overhead and
5187 // potentially expose ILP opportunities.
5188 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5189 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5190 // We assume that the cost overhead is 1 and we use the cost model
5191 // to estimate the cost of the loop and interleave until the cost of the
5192 // loop overhead is about 5% of the cost of the loop.
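// For example (illustrative costs), if SmallLoopCost were 20 and LoopCost
// were 6, SmallIC would be clamped to min(IC, PowerOf2Floor(20 / 6)) =
// min(IC, 2).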
5193 unsigned SmallIC =
5194 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5196 // Interleave until store/load ports (estimated by max interleave count) are
5197 // saturated.
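// For example (illustrative counts), with IC = 8, two stores and one load in
// the loop body, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8.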
5198 unsigned NumStores = Legal->getNumStores();
5199 unsigned NumLoads = Legal->getNumLoads();
5200 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5201 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5203 // If we have a scalar reduction (vector reductions are already dealt with
5204 // by this point), we can increase the critical path length if the loop
5205 // we're interleaving is inside another loop. Limit it, by default, to 2, so the
5206 // critical path only gets increased by one reduction operation.
5207 if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5208 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5209 SmallIC = std::min(SmallIC, F);
5210 StoresIC = std::min(StoresIC, F);
5211 LoadsIC = std::min(LoadsIC, F);
5214 if (EnableLoadStoreRuntimeInterleave &&
5215 std::max(StoresIC, LoadsIC) > SmallIC) {
5216 LLVM_DEBUG(
5217 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5218 return std::max(StoresIC, LoadsIC);
5221 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5222 return SmallIC;
5225 // Interleave if this is a large loop (small loops are already dealt with by
5226 // this point) that could benefit from interleaving.
5227 bool HasReductions = !Legal->getReductionVars()->empty();
5228 if (TTI.enableAggressiveInterleaving(HasReductions)) {
5229 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5230 return IC;
5233 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5234 return 1;
5237 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5238 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5239 // This function calculates the register usage by measuring the highest number
5240 // of values that are alive at a single location. Obviously, this is a very
5241 // rough estimation. We scan the loop in topological order and
5242 // assign a number to each instruction. We use RPO to ensure that defs are
5243 // met before their users. We assume that each instruction that has in-loop
5244 // users starts an interval. We record every time that an in-loop value is
5245 // used, so we have a list of the first and last occurrences of each
5246 // instruction. Next, we transpose this data structure into a multi map that
5247 // holds the list of intervals that *end* at a specific location. This multi
5248 // map allows us to perform a linear search. We scan the instructions linearly
5249 // and record each time that a new interval starts, by placing it in a set.
5250 // If we find this value in the multi-map then we remove it from the set.
5251 // The max register usage is the maximum size of the set.
5252 // We also search for instructions that are defined outside the loop, but are
5253 // used inside the loop. We need this number separately from the max-interval
5254 // usage number because when we unroll, loop-invariant values do not take
5255 // more registers.
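// For example, a value defined early in the loop body but only used by the
// last instruction keeps its interval open across the whole body, so it
// contributes to the register count at every intermediate point.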
5256 LoopBlocksDFS DFS(TheLoop);
5257 DFS.perform(LI);
5259 RegisterUsage RU;
5261 // Each 'key' in the map opens a new interval. The values
5262 // of the map are the index of the 'last seen' usage of the
5263 // instruction that is the key.
5264 using IntervalMap = DenseMap<Instruction *, unsigned>;
5266 // Maps instruction to its index.
5267 SmallVector<Instruction *, 64> IdxToInstr;
5268 // Marks the end of each interval.
5269 IntervalMap EndPoint;
5271 // Saves the set of instructions that are used in the loop.
5271 SmallPtrSet<Instruction *, 8> Ends;
5272 // Saves the list of values that are used in the loop but are
5273 // defined outside the loop, such as arguments and constants.
5274 SmallPtrSet<Value *, 8> LoopInvariants;
5276 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5277 for (Instruction &I : BB->instructionsWithoutDebug()) {
5278 IdxToInstr.push_back(&I);
5280 // Save the end location of each USE.
5281 for (Value *U : I.operands()) {
5282 auto *Instr = dyn_cast<Instruction>(U);
5284 // Ignore non-instruction values such as arguments, constants, etc.
5285 if (!Instr)
5286 continue;
5288 // If this instruction is outside the loop then record it and continue.
5289 if (!TheLoop->contains(Instr)) {
5290 LoopInvariants.insert(Instr);
5291 continue;
5294 // Overwrite previous end points.
5295 EndPoint[Instr] = IdxToInstr.size();
5296 Ends.insert(Instr);
5301 // Saves the list of intervals that end with the index in 'key'.
5302 using InstrList = SmallVector<Instruction *, 2>;
5303 DenseMap<unsigned, InstrList> TransposeEnds;
5305 // Transpose the EndPoints to a list of values that end at each index.
5306 for (auto &Interval : EndPoint)
5307 TransposeEnds[Interval.second].push_back(Interval.first);
5309 SmallPtrSet<Instruction *, 8> OpenIntervals;
5311 // Get the size of the widest register.
5312 unsigned MaxSafeDepDist = -1U;
5313 if (Legal->getMaxSafeDepDistBytes() != -1U)
5314 MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5315 unsigned WidestRegister =
5316 std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5317 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5319 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5320 SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5322 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5324 // A lambda that gets the register usage for the given type and VF.
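// For example (illustrative values), with VF = 8, a 32-bit element type and a
// 128-bit widest register, a value occupies max(1, 8 * 32 / 128) = 2 vector
// registers.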
5325 auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5326 if (Ty->isTokenTy())
5327 return 0U;
5328 unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5329 return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5332 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5333 Instruction *I = IdxToInstr[i];
5335 // Remove all of the instructions that end at this location.
5336 InstrList &List = TransposeEnds[i];
5337 for (Instruction *ToRemove : List)
5338 OpenIntervals.erase(ToRemove);
5340 // Ignore instructions that are never used within the loop.
5341 if (Ends.find(I) == Ends.end())
5342 continue;
5344 // Skip ignored values.
5345 if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5346 continue;
5348 // For each VF find the maximum usage of registers.
5349 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5350 if (VFs[j] == 1) {
5351 MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5352 continue;
5354 collectUniformsAndScalars(VFs[j]);
5355 // Count the number of live intervals.
5356 unsigned RegUsage = 0;
5357 for (auto Inst : OpenIntervals) {
5358 // Skip ignored values for VF > 1.
5359 if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5360 isScalarAfterVectorization(Inst, VFs[j]))
5361 continue;
5362 RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5364 MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5367 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5368 << OpenIntervals.size() << '\n');
5370 // Add the current instruction to the list of open intervals.
5371 OpenIntervals.insert(I);
5374 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5375 unsigned Invariant = 0;
5376 if (VFs[i] == 1)
5377 Invariant = LoopInvariants.size();
5378 else {
5379 for (auto Inst : LoopInvariants)
5380 Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5383 LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5384 LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5385 LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5386 << '\n');
5388 RU.LoopInvariantRegs = Invariant;
5389 RU.MaxLocalUsers = MaxUsages[i];
5390 RUs[i] = RU;
5393 return RUs;
5396 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5397 // TODO: Cost model for emulated masked load/store is completely
5398 // broken. This hack guides the cost model to use an artificially
5399 // high enough value to practically disable vectorization with such
5400 // operations, except where the previously deployed legality hack allowed
5401 // using very low cost values. This is to avoid regressions coming simply
5402 // from moving the "masked load/store" check from legality to the cost model.
5403 // Masked Load/Gather emulation was previously never allowed.
5404 // Only a limited amount of Masked Store/Scatter emulation was allowed.
5405 assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5406 return isa<LoadInst>(I) ||
5407 (isa<StoreInst>(I) &&
5408 NumPredStores > NumberOfStoresToPredicate);
5411 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5412 // If we aren't vectorizing the loop, or if we've already collected the
5413 // instructions to scalarize, there's nothing to do. Collection may already
5414 // have occurred if we have a user-selected VF and are now computing the
5415 // expected cost for interleaving.
5416 if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5417 return;
5419 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5420 // not profitable to scalarize any instructions, the presence of VF in the
5421 // map will indicate that we've analyzed it already.
5422 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5424 // Find all the instructions that are scalar with predication in the loop and
5425 // determine if it would be better to not if-convert the blocks they are in.
5426 // If so, we also record the instructions to scalarize.
5427 for (BasicBlock *BB : TheLoop->blocks()) {
5428 if (!blockNeedsPredication(BB))
5429 continue;
5430 for (Instruction &I : *BB)
5431 if (isScalarWithPredication(&I)) {
5432 ScalarCostsTy ScalarCosts;
5433 // Do not apply discount logic if hacked cost is needed
5434 // for emulated masked memrefs.
5435 if (!useEmulatedMaskMemRefHack(&I) &&
5436 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5437 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5438 // Remember that BB will remain after vectorization.
5439 PredicatedBBsAfterVectorization.insert(BB);
5444 int LoopVectorizationCostModel::computePredInstDiscount(
5445 Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5446 unsigned VF) {
5447 assert(!isUniformAfterVectorization(PredInst, VF) &&
5448 "Instruction marked uniform-after-vectorization will be predicated");
5450 // Initialize the discount to zero, meaning that the scalar version and the
5451 // vector version cost the same.
5452 int Discount = 0;
5454 // Holds instructions to analyze. The instructions we visit are mapped in
5455 // ScalarCosts. Those instructions are the ones that would be scalarized if
5456 // we find that the scalar version costs less.
5457 SmallVector<Instruction *, 8> Worklist;
5459 // Returns true if the given instruction can be scalarized.
5460 auto canBeScalarized = [&](Instruction *I) -> bool {
5461 // We only attempt to scalarize instructions forming a single-use chain
5462 // from the original predicated block that would otherwise be vectorized.
5463 // Although not strictly necessary, we give up on instructions we know will
5464 // already be scalar to avoid traversing chains that are unlikely to be
5465 // beneficial.
5466 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5467 isScalarAfterVectorization(I, VF))
5468 return false;
5470 // If the instruction is scalar with predication, it will be analyzed
5471 // separately. We ignore it within the context of PredInst.
5472 if (isScalarWithPredication(I))
5473 return false;
5475 // If any of the instruction's operands are uniform after vectorization,
5476 // the instruction cannot be scalarized. This prevents, for example, a
5477 // masked load from being scalarized.
5479 // We assume we will only emit a value for lane zero of an instruction
5480 // marked uniform after vectorization, rather than VF identical values.
5481 // Thus, if we scalarize an instruction that uses a uniform, we would
5482 // create uses of values corresponding to the lanes we aren't emitting code
5483 // for. This behavior can be changed by allowing getScalarValue to clone
5484 // the lane zero values for uniforms rather than asserting.
5485 for (Use &U : I->operands())
5486 if (auto *J = dyn_cast<Instruction>(U.get()))
5487 if (isUniformAfterVectorization(J, VF))
5488 return false;
5490 // Otherwise, we can scalarize the instruction.
5491 return true;
5494 // Compute the expected cost discount from scalarizing the entire expression
5495 // feeding the predicated instruction. We currently only consider expressions
5496 // that are single-use instruction chains.
5497 Worklist.push_back(PredInst);
5498 while (!Worklist.empty()) {
5499 Instruction *I = Worklist.pop_back_val();
5501 // If we've already analyzed the instruction, there's nothing to do.
5502 if (ScalarCosts.find(I) != ScalarCosts.end())
5503 continue;
5505 // Compute the cost of the vector instruction. Note that this cost already
5506 // includes the scalarization overhead of the predicated instruction.
5507 unsigned VectorCost = getInstructionCost(I, VF).first;
5509 // Compute the cost of the scalarized instruction. This cost is the cost of
5510 // the instruction as if it wasn't if-converted and instead remained in the
5511 // predicated block. We will scale this cost by block probability after
5512 // computing the scalarization overhead.
5513 unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5515 // Compute the scalarization overhead of needed insertelement instructions
5516 // and phi nodes.
5517 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5518 ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5519 true, false);
5520 ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5523 // Compute the scalarization overhead of needed extractelement
5524 // instructions. For each of the instruction's operands, if the operand can
5525 // be scalarized, add it to the worklist; otherwise, account for the
5526 // overhead.
5527 for (Use &U : I->operands())
5528 if (auto *J = dyn_cast<Instruction>(U.get())) {
5529 assert(VectorType::isValidElementType(J->getType()) &&
5530 "Instruction has non-scalar type");
5531 if (canBeScalarized(J))
5532 Worklist.push_back(J);
5533 else if (needsExtract(J, VF))
5534 ScalarCost += TTI.getScalarizationOverhead(
5535 ToVectorTy(J->getType(), VF), false, true);
5538 // Scale the total scalar cost by block probability.
5539 ScalarCost /= getReciprocalPredBlockProb();
5541 // Compute the discount. A non-negative discount means the vector version
5542 // of the instruction costs more, and scalarizing would be beneficial.
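 // Illustrative numbers (assumed, not taken from any target): with VF = 4, a
 // predicated udiv whose vector cost is 14 and whose per-lane scalar cost is 2
 // gives ScalarCost = 4 * 2 = 8 before insert/extract overhead; assuming
 // getReciprocalPredBlockProb() returns 2 (a 50% execution probability), that
 // is scaled to 4, so the discount grows by 14 - 4 = 10 and scalarizing this
 // chain looks worthwhile.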
5543 Discount += VectorCost - ScalarCost;
5544 ScalarCosts[I] = ScalarCost;
5547 return Discount;
5550 LoopVectorizationCostModel::VectorizationCostTy
5551 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5552 VectorizationCostTy Cost;
5554 // For each block.
5555 for (BasicBlock *BB : TheLoop->blocks()) {
5556 VectorizationCostTy BlockCost;
5558 // For each instruction in the old loop.
5559 for (Instruction &I : BB->instructionsWithoutDebug()) {
5560 // Skip ignored values.
5561 if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5562 (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5563 continue;
5565 VectorizationCostTy C = getInstructionCost(&I, VF);
5567 // Check if we should override the cost.
5568 if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5569 C.first = ForceTargetInstructionCost;
5571 BlockCost.first += C.first;
5572 BlockCost.second |= C.second;
5573 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5574 << " for VF " << VF << " For instruction: " << I
5575 << '\n');
5578 // If we are vectorizing a predicated block, it will have been
5579 // if-converted. This means that the block's instructions (aside from
5580 // stores and instructions that may divide by zero) will now be
5581 // unconditionally executed. For the scalar case, we may not always execute
5582 // the predicated block. Thus, scale the block's cost by the probability of
5583 // executing it.
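 // For example, if the predicated block's scalar instructions sum to a cost of
 // 10 and getReciprocalPredBlockProb() is assumed to return 2 (i.e. the block
 // is expected to execute half the time), it contributes 5 to the VF == 1 cost.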
5584 if (VF == 1 && blockNeedsPredication(BB))
5585 BlockCost.first /= getReciprocalPredBlockProb();
5587 Cost.first += BlockCost.first;
5588 Cost.second |= BlockCost.second;
5591 return Cost;
5594 /// Gets Address Access SCEV after verifying that the access pattern
5595 /// is loop invariant except for the induction variable dependence.
5597 /// This SCEV can be sent to the Target in order to estimate the address
5598 /// calculation cost.
5599 static const SCEV *getAddressAccessSCEV(
5600 Value *Ptr,
5601 LoopVectorizationLegality *Legal,
5602 PredicatedScalarEvolution &PSE,
5603 const Loop *TheLoop) {
5605 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5606 if (!Gep)
5607 return nullptr;
5609 // We are looking for a gep with all loop invariant indices except for one
5610 // which should be an induction variable.
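 // For example (hypothetical IR), the following access qualifies because %inv
 // is loop invariant and %ind is an induction variable:
 //   %gep = getelementptr inbounds [64 x i32], [64 x i32]* %A, i64 %inv, i64 %ind
 // whereas a GEP whose variant index is not an induction (e.g. a value loaded
 // inside the loop) makes us return nullptr below.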
5611 auto SE = PSE.getSE();
5612 unsigned NumOperands = Gep->getNumOperands();
5613 for (unsigned i = 1; i < NumOperands; ++i) {
5614 Value *Opd = Gep->getOperand(i);
5615 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5616 !Legal->isInductionVariable(Opd))
5617 return nullptr;
5620 // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5621 return PSE.getSCEV(Ptr);
5624 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5625 return Legal->hasStride(I->getOperand(0)) ||
5626 Legal->hasStride(I->getOperand(1));
5629 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5630 unsigned VF) {
5631 assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5632 Type *ValTy = getMemInstValueType(I);
5633 auto SE = PSE.getSE();
5635 unsigned Alignment = getLoadStoreAlignment(I);
5636 unsigned AS = getLoadStoreAddressSpace(I);
5637 Value *Ptr = getLoadStorePointerOperand(I);
5638 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5640 // Figure out whether the access is strided and get the stride value
5641 // if it's known at compile time.
5642 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5644 // Get the cost of the scalar memory instruction and address computation.
5645 unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5647 // Don't pass *I here, since it is scalar but will actually be part of a
5648 // vectorized loop where its user is a vectorized instruction.
5649 Cost += VF *
5650 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5651 AS);
5653 // Get the overhead of the extractelement and insertelement instructions
5654 // we might create due to scalarization.
5655 Cost += getScalarizationOverhead(I, VF);
5657 // If we have a predicated store, it may not be executed for each vector
5658 // lane. Scale the cost by the probability of executing the predicated
5659 // block.
5660 if (isPredicatedInst(I)) {
5661 Cost /= getReciprocalPredBlockProb();
5663 if (useEmulatedMaskMemRefHack(I))
5664 // Artificially setting to a high enough value to practically disable
5665 // vectorization with such operations.
5666 Cost = 3000000;
5669 return Cost;
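 // The computation above, with assumed unit costs for illustration: at VF = 4,
 // address computation and the scalar load each cost 1, giving a base cost of
 // 4 + 4 = 8 plus insert/extract overhead. If the load is predicated, the cost
 // is first scaled by the block probability, and because
 // useEmulatedMaskMemRefHack() returns true for loads, it is then pinned to
 // 3000000 so that such emulation is effectively never chosen.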
5672 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5673 unsigned VF) {
5674 Type *ValTy = getMemInstValueType(I);
5675 Type *VectorTy = ToVectorTy(ValTy, VF);
5676 unsigned Alignment = getLoadStoreAlignment(I);
5677 Value *Ptr = getLoadStorePointerOperand(I);
5678 unsigned AS = getLoadStoreAddressSpace(I);
5679 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5681 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5682 "Stride should be 1 or -1 for consecutive memory access");
5683 unsigned Cost = 0;
5684 if (Legal->isMaskRequired(I))
5685 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5686 else
5687 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5689 bool Reverse = ConsecutiveStride < 0;
5690 if (Reverse)
5691 Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5692 return Cost;
5695 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5696 unsigned VF) {
5697 Type *ValTy = getMemInstValueType(I);
5698 Type *VectorTy = ToVectorTy(ValTy, VF);
5699 unsigned Alignment = getLoadStoreAlignment(I);
5700 unsigned AS = getLoadStoreAddressSpace(I);
5701 if (isa<LoadInst>(I)) {
5702 return TTI.getAddressComputationCost(ValTy) +
5703 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5704 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5706 StoreInst *SI = cast<StoreInst>(I);
5708 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5709 return TTI.getAddressComputationCost(ValTy) +
5710 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5711 (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5712 Instruction::ExtractElement,
5713 VectorTy, VF - 1));
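 // For illustration, the uniform load above is priced as: address computation
 // + one scalar load + a broadcast shuffle to splat the loaded value. The
 // uniform store is priced as: address computation + one scalar store, plus an
 // extract of the last lane (VF - 1) unless the stored value is loop
 // invariant, in which case no extract is needed.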
5716 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5717 unsigned VF) {
5718 Type *ValTy = getMemInstValueType(I);
5719 Type *VectorTy = ToVectorTy(ValTy, VF);
5720 unsigned Alignment = getLoadStoreAlignment(I);
5721 Value *Ptr = getLoadStorePointerOperand(I);
5723 return TTI.getAddressComputationCost(VectorTy) +
5724 TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5725 Legal->isMaskRequired(I), Alignment);
5728 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5729 unsigned VF) {
5730 Type *ValTy = getMemInstValueType(I);
5731 Type *VectorTy = ToVectorTy(ValTy, VF);
5732 unsigned AS = getLoadStoreAddressSpace(I);
5734 auto Group = getInterleavedAccessGroup(I);
5735 assert(Group && "Failed to get an interleaved access group.");
5737 unsigned InterleaveFactor = Group->getFactor();
5738 Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5740 // Holds the indices of existing members in an interleaved load group.
5741 // An interleaved store group doesn't need this as it doesn't allow gaps.
5742 SmallVector<unsigned, 4> Indices;
5743 if (isa<LoadInst>(I)) {
5744 for (unsigned i = 0; i < InterleaveFactor; i++)
5745 if (Group->getMember(i))
5746 Indices.push_back(i);
5749 // Calculate the cost of the whole interleaved group.
5750 bool UseMaskForGaps =
5751 Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5752 unsigned Cost = TTI.getInterleavedMemoryOpCost(
5753 I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5754 Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5756 if (Group->isReverse()) {
5757 // TODO: Add support for reversed masked interleaved access.
5758 assert(!Legal->isMaskRequired(I) &&
5759 "Reverse masked interleaved access not supported.");
5760 Cost += Group->getNumMembers() *
5761 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5763 return Cost;
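 // For example, a reversed load group with factor 2 and both members present
 // gets Indices = {0, 1}, a wide vector of VF * 2 elements for the interleaved
 // memory cost query, and, being reversed, one extra SK_Reverse shuffle per
 // member on top of that.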
5766 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5767 unsigned VF) {
5768 // Calculate the scalar cost only. The vectorization cost should already
5769 // be available at this point.
5770 if (VF == 1) {
5771 Type *ValTy = getMemInstValueType(I);
5772 unsigned Alignment = getLoadStoreAlignment(I);
5773 unsigned AS = getLoadStoreAddressSpace(I);
5775 return TTI.getAddressComputationCost(ValTy) +
5776 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5778 return getWideningCost(I, VF);
5781 LoopVectorizationCostModel::VectorizationCostTy
5782 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5783 // If we know that this instruction will remain uniform, check the cost of
5784 // the scalar version.
5785 if (isUniformAfterVectorization(I, VF))
5786 VF = 1;
5788 if (VF > 1 && isProfitableToScalarize(I, VF))
5789 return VectorizationCostTy(InstsToScalarize[VF][I], false);
5791 // Forced scalars do not have any scalarization overhead.
5792 auto ForcedScalar = ForcedScalars.find(VF);
5793 if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5794 auto InstSet = ForcedScalar->second;
5795 if (InstSet.find(I) != InstSet.end())
5796 return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5799 Type *VectorTy;
5800 unsigned C = getInstructionCost(I, VF, VectorTy);
5802 bool TypeNotScalarized =
5803 VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5804 return VectorizationCostTy(C, TypeNotScalarized);
5807 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5808 unsigned VF) {
5810 if (VF == 1)
5811 return 0;
5813 unsigned Cost = 0;
5814 Type *RetTy = ToVectorTy(I->getType(), VF);
5815 if (!RetTy->isVoidTy() &&
5816 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5817 Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5819 // Some targets keep addresses scalar.
5820 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5821 return Cost;
5823 // Some targets support efficient element stores.
5824 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5825 return Cost;
5827 // Collect operands to consider.
5828 CallInst *CI = dyn_cast<CallInst>(I);
5829 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5831 // Skip operands that do not require extraction/scalarization and do not incur
5832 // any overhead.
5833 return Cost + TTI.getOperandsScalarizationOverhead(
5834 filterExtractingOperands(Ops, VF), VF);
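 // As a rough example with VF = 4: a non-void instruction pays for the four
 // insertelements that rebuild its result vector, and each operand that must
 // be extracted from a vector (as selected by filterExtractingOperands) pays
 // for four extractelements, unless one of the early exits above for scalar
 // addresses or efficient element load/store applies.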
5837 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5838 if (VF == 1)
5839 return;
5840 NumPredStores = 0;
5841 for (BasicBlock *BB : TheLoop->blocks()) {
5842 // For each instruction in the old loop.
5843 for (Instruction &I : *BB) {
5844 Value *Ptr = getLoadStorePointerOperand(&I);
5845 if (!Ptr)
5846 continue;
5848 // TODO: We should generate better code and update the cost model for
5849 // predicated uniform stores. Today they are treated as any other
5850 // predicated store (see added test cases in
5851 // invariant-store-vectorization.ll).
5852 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5853 NumPredStores++;
5855 if (Legal->isUniform(Ptr) &&
5856 // Conditional loads and stores should be scalarized and predicated.
5857 // isScalarWithPredication cannot be used here since masked
5858 // gather/scatters are not considered scalar with predication.
5859 !Legal->blockNeedsPredication(I.getParent())) {
5860 // TODO: Avoid replicating loads and stores instead of
5861 // relying on instcombine to remove them.
5862 // Load: Scalar load + broadcast
5863 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5864 unsigned Cost = getUniformMemOpCost(&I, VF);
5865 setWideningDecision(&I, VF, CM_Scalarize, Cost);
5866 continue;
5869 // We assume that widening is the best solution when possible.
5870 if (memoryInstructionCanBeWidened(&I, VF)) {
5871 unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5872 int ConsecutiveStride =
5873 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5874 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5875 "Expected consecutive stride.");
5876 InstWidening Decision =
5877 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5878 setWideningDecision(&I, VF, Decision, Cost);
5879 continue;
5882 // Choose between Interleaving, Gather/Scatter or Scalarization.
5883 unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5884 unsigned NumAccesses = 1;
5885 if (isAccessInterleaved(&I)) {
5886 auto Group = getInterleavedAccessGroup(&I);
5887 assert(Group && "Failed to get an interleaved access group.");
5889 // Make one decision for the whole group.
5890 if (getWideningDecision(&I, VF) != CM_Unknown)
5891 continue;
5893 NumAccesses = Group->getNumMembers();
5894 if (interleavedAccessCanBeWidened(&I, VF))
5895 InterleaveCost = getInterleaveGroupCost(&I, VF);
5898 unsigned GatherScatterCost =
5899 isLegalGatherOrScatter(&I)
5900 ? getGatherScatterCost(&I, VF) * NumAccesses
5901 : std::numeric_limits<unsigned>::max();
5903 unsigned ScalarizationCost =
5904 getMemInstScalarizationCost(&I, VF) * NumAccesses;
5906 // Choose better solution for the current VF,
5907 // write down this decision and use it during vectorization.
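 // Illustrative choice with assumed costs: InterleaveCost = 8, no legal
 // gather/scatter (so GatherScatterCost stays at its max() sentinel) and
 // ScalarizationCost = 12 selects CM_Interleave with cost 8. Note that
 // interleaving wins ties against gather/scatter (<=) but must be strictly
 // cheaper than scalarization (<).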
5908 unsigned Cost;
5909 InstWidening Decision;
5910 if (InterleaveCost <= GatherScatterCost &&
5911 InterleaveCost < ScalarizationCost) {
5912 Decision = CM_Interleave;
5913 Cost = InterleaveCost;
5914 } else if (GatherScatterCost < ScalarizationCost) {
5915 Decision = CM_GatherScatter;
5916 Cost = GatherScatterCost;
5917 } else {
5918 Decision = CM_Scalarize;
5919 Cost = ScalarizationCost;
5921 // If the instruction belongs to an interleave group, the whole group
5922 // receives the same decision. The whole group receives the cost, but
5923 // the cost will actually be assigned to one instruction.
5924 if (auto Group = getInterleavedAccessGroup(&I))
5925 setWideningDecision(Group, VF, Decision, Cost);
5926 else
5927 setWideningDecision(&I, VF, Decision, Cost);
5931 // Make sure that any load of an address and any other address computation
5932 // remains scalar unless there is gather/scatter support. This avoids
5933 // inevitable extracts into address registers, and also has the benefit of
5934 // activating LSR more, since that pass can't optimize vectorized
5935 // addresses.
5936 if (TTI.prefersVectorizedAddressing())
5937 return;
5939 // Start with all scalar pointer uses.
5940 SmallPtrSet<Instruction *, 8> AddrDefs;
5941 for (BasicBlock *BB : TheLoop->blocks())
5942 for (Instruction &I : *BB) {
5943 Instruction *PtrDef =
5944 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5945 if (PtrDef && TheLoop->contains(PtrDef) &&
5946 getWideningDecision(&I, VF) != CM_GatherScatter)
5947 AddrDefs.insert(PtrDef);
5950 // Add all instructions used to generate the addresses.
5951 SmallVector<Instruction *, 4> Worklist;
5952 for (auto *I : AddrDefs)
5953 Worklist.push_back(I);
5954 while (!Worklist.empty()) {
5955 Instruction *I = Worklist.pop_back_val();
5956 for (auto &Op : I->operands())
5957 if (auto *InstOp = dyn_cast<Instruction>(Op))
5958 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5959 AddrDefs.insert(InstOp).second)
5960 Worklist.push_back(InstOp);
5963 for (auto *I : AddrDefs) {
5964 if (isa<LoadInst>(I)) {
5965 // Setting the desired widening decision should ideally be handled by
5966 // cost functions, but since this involves the task of finding out
5967 // if the loaded register is involved in an address computation, it is
5968 // instead changed here when we know this is the case.
5969 InstWidening Decision = getWideningDecision(I, VF);
5970 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5971 // Scalarize a widened load of address.
5972 setWideningDecision(I, VF, CM_Scalarize,
5973 (VF * getMemoryInstructionCost(I, 1)));
5974 else if (auto Group = getInterleavedAccessGroup(I)) {
5975 // Scalarize an interleave group of address loads.
5976 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5977 if (Instruction *Member = Group->getMember(I))
5978 setWideningDecision(Member, VF, CM_Scalarize,
5979 (VF * getMemoryInstructionCost(Member, 1)));
5982 } else
5983 // Make sure I gets scalarized and is given a cost estimate without
5984 // scalarization overhead.
5985 ForcedScalars[VF].insert(I);
5989 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5990 unsigned VF,
5991 Type *&VectorTy) {
5992 Type *RetTy = I->getType();
5993 if (canTruncateToMinimalBitwidth(I, VF))
5994 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5995 VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5996 auto SE = PSE.getSE();
5998 // TODO: We need to estimate the cost of intrinsic calls.
5999 switch (I->getOpcode()) {
6000 case Instruction::GetElementPtr:
6001 // We mark this instruction as zero-cost because the cost of GEPs in
6002 // vectorized code depends on whether the corresponding memory instruction
6003 // is scalarized or not. Therefore, we handle GEPs with the memory
6004 // instruction cost.
6005 return 0;
6006 case Instruction::Br: {
6007 // In cases of scalarized and predicated instructions, there will be VF
6008 // predicated blocks in the vectorized loop. Each branch around these
6009 // blocks also requires an extract of its vector compare i1 element.
6010 bool ScalarPredicatedBB = false;
6011 BranchInst *BI = cast<BranchInst>(I);
6012 if (VF > 1 && BI->isConditional() &&
6013 (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6014 PredicatedBBsAfterVectorization.end() ||
6015 PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6016 PredicatedBBsAfterVectorization.end()))
6017 ScalarPredicatedBB = true;
6019 if (ScalarPredicatedBB) {
6020 // Return cost for branches around scalarized and predicated blocks.
6021 Type *Vec_i1Ty =
6022 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6023 return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6024 (TTI.getCFInstrCost(Instruction::Br) * VF));
6025 } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6026 // The back-edge branch will remain, as will all scalar branches.
6027 return TTI.getCFInstrCost(Instruction::Br);
6028 else
6029 // This branch will be eliminated by if-conversion.
6030 return 0;
6031 // Note: We currently assume zero cost for an unconditional branch inside
6032 // a predicated block since it will become a fall-through, although we
6033 // may decide in the future to call TTI for all branches.
6035 case Instruction::PHI: {
6036 auto *Phi = cast<PHINode>(I);
6038 // First-order recurrences are replaced by vector shuffles inside the loop.
6039 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6040 if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6041 return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6042 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6044 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6045 // converted into select instructions. We require N - 1 selects per phi
6046 // node, where N is the number of incoming values.
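 // For example, a phi in an if-converted block with three incoming values
 // lowers to two vector selects, so its cost is modelled as
 // 2 * TTI.getCmpSelInstrCost(Instruction::Select, <VF x ty>, <VF x i1>).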
6047 if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6048 return (Phi->getNumIncomingValues() - 1) *
6049 TTI.getCmpSelInstrCost(
6050 Instruction::Select, ToVectorTy(Phi->getType(), VF),
6051 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6053 return TTI.getCFInstrCost(Instruction::PHI);
6055 case Instruction::UDiv:
6056 case Instruction::SDiv:
6057 case Instruction::URem:
6058 case Instruction::SRem:
6059 // If we have a predicated instruction, it may not be executed for each
6060 // vector lane. Get the scalarization cost and scale this amount by the
6061 // probability of executing the predicated block. If the instruction is not
6062 // predicated, we fall through to the next case.
6063 if (VF > 1 && isScalarWithPredication(I)) {
6064 unsigned Cost = 0;
6066 // These instructions have a non-void type, so account for the phi nodes
6067 // that we will create. This cost is likely to be zero. The phi node
6068 // cost, if any, should be scaled by the block probability because it
6069 // models a copy at the end of each predicated block.
6070 Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6072 // The cost of the non-predicated instruction.
6073 Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6075 // The cost of insertelement and extractelement instructions needed for
6076 // scalarization.
6077 Cost += getScalarizationOverhead(I, VF);
6079 // Scale the cost by the probability of executing the predicated blocks.
6080 // This assumes the predicated block for each vector lane is equally
6081 // likely.
6082 return Cost / getReciprocalPredBlockProb();
6084 LLVM_FALLTHROUGH;
6085 case Instruction::Add:
6086 case Instruction::FAdd:
6087 case Instruction::Sub:
6088 case Instruction::FSub:
6089 case Instruction::Mul:
6090 case Instruction::FMul:
6091 case Instruction::FDiv:
6092 case Instruction::FRem:
6093 case Instruction::Shl:
6094 case Instruction::LShr:
6095 case Instruction::AShr:
6096 case Instruction::And:
6097 case Instruction::Or:
6098 case Instruction::Xor: {
6099 // Since we will replace the stride by 1, the multiplication should go away.
6100 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6101 return 0;
6102 // Certain instructions can be cheaper to vectorize if they have a constant
6103 // second vector operand. One example of this is shifts on x86.
6104 Value *Op2 = I->getOperand(1);
6105 TargetTransformInfo::OperandValueProperties Op2VP;
6106 TargetTransformInfo::OperandValueKind Op2VK =
6107 TTI.getOperandInfo(Op2, Op2VP);
6108 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6109 Op2VK = TargetTransformInfo::OK_UniformValue;
6111 SmallVector<const Value *, 4> Operands(I->operand_values());
6112 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6113 return N * TTI.getArithmeticInstrCost(
6114 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6115 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6117 case Instruction::FNeg: {
6118 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6119 return N * TTI.getArithmeticInstrCost(
6120 I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6121 TargetTransformInfo::OK_AnyValue,
6122 TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6123 I->getOperand(0));
6125 case Instruction::Select: {
6126 SelectInst *SI = cast<SelectInst>(I);
6127 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6128 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6129 Type *CondTy = SI->getCondition()->getType();
6130 if (!ScalarCond)
6131 CondTy = VectorType::get(CondTy, VF);
6133 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6135 case Instruction::ICmp:
6136 case Instruction::FCmp: {
6137 Type *ValTy = I->getOperand(0)->getType();
6138 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6139 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6140 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6141 VectorTy = ToVectorTy(ValTy, VF);
6142 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6144 case Instruction::Store:
6145 case Instruction::Load: {
6146 unsigned Width = VF;
6147 if (Width > 1) {
6148 InstWidening Decision = getWideningDecision(I, Width);
6149 assert(Decision != CM_Unknown &&
6150 "CM decision should be taken at this point");
6151 if (Decision == CM_Scalarize)
6152 Width = 1;
6154 VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6155 return getMemoryInstructionCost(I, VF);
6157 case Instruction::ZExt:
6158 case Instruction::SExt:
6159 case Instruction::FPToUI:
6160 case Instruction::FPToSI:
6161 case Instruction::FPExt:
6162 case Instruction::PtrToInt:
6163 case Instruction::IntToPtr:
6164 case Instruction::SIToFP:
6165 case Instruction::UIToFP:
6166 case Instruction::Trunc:
6167 case Instruction::FPTrunc:
6168 case Instruction::BitCast: {
6169 // We optimize the truncation of induction variables having constant
6170 // integer steps. The cost of these truncations is the same as the scalar
6171 // operation.
6172 if (isOptimizableIVTruncate(I, VF)) {
6173 auto *Trunc = cast<TruncInst>(I);
6174 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6175 Trunc->getSrcTy(), Trunc);
6178 Type *SrcScalarTy = I->getOperand(0)->getType();
6179 Type *SrcVecTy =
6180 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6181 if (canTruncateToMinimalBitwidth(I, VF)) {
6182 // This cast is going to be shrunk. This may remove the cast or it might
6183 // turn it into a slightly different cast. For example, if MinBW == 16,
6184 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6186 // Calculate the modified src and dest types.
6187 Type *MinVecTy = VectorTy;
6188 if (I->getOpcode() == Instruction::Trunc) {
6189 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6190 VectorTy =
6191 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6192 } else if (I->getOpcode() == Instruction::ZExt ||
6193 I->getOpcode() == Instruction::SExt) {
6194 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6195 VectorTy =
6196 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6200 unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6201 return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6203 case Instruction::Call: {
6204 bool NeedToScalarize;
6205 CallInst *CI = cast<CallInst>(I);
6206 unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6207 if (getVectorIntrinsicIDForCall(CI, TLI))
6208 return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6209 return CallCost;
6211 default:
6212 // The cost of executing VF copies of the scalar instruction. This opcode
6213 // is unknown. Assume that it is the same as 'mul'.
6214 return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6215 getScalarizationOverhead(I, VF);
6216 } // end of switch.
6219 char LoopVectorize::ID = 0;
6221 static const char lv_name[] = "Loop Vectorization";
6223 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6224 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6225 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6226 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6227 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6228 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6229 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6230 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6231 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6232 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6233 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6234 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6235 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6236 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6237 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6239 namespace llvm {
6241 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6243 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6244 bool VectorizeOnlyWhenForced) {
6245 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6248 } // end namespace llvm
6250 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6251 // Check if the pointer operand of a load or store instruction is
6252 // consecutive.
6253 if (auto *Ptr = getLoadStorePointerOperand(Inst))
6254 return Legal->isConsecutivePtr(Ptr);
6255 return false;
6258 void LoopVectorizationCostModel::collectValuesToIgnore() {
6259 // Ignore ephemeral values.
6260 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6262 // Ignore type-promoting instructions we identified during reduction
6263 // detection.
6264 for (auto &Reduction : *Legal->getReductionVars()) {
6265 RecurrenceDescriptor &RedDes = Reduction.second;
6266 SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6267 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6269 // Ignore type-casting instructions we identified during induction
6270 // detection.
6271 for (auto &Induction : *Legal->getInductionVars()) {
6272 InductionDescriptor &IndDes = Induction.second;
6273 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6274 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6278 // TODO: we could return a pair of values that specify the max VF and
6279 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6280 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6281 // doesn't have a cost model that can choose which plan to execute if
6282 // more than one is generated.
6283 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6284 LoopVectorizationCostModel &CM) {
6285 unsigned WidestType;
6286 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6287 return WidestVectorRegBits / WidestType;
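 // For example, the division above yields a VF of 256 / 32 = 8 for 256-bit
 // vector registers and a widest in-loop scalar type of 32 bits.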
6290 VectorizationFactor
6291 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6292 unsigned VF = UserVF;
6293 // Outer loop handling: outer loops may require CFG and instruction level
6294 // transformations before even evaluating whether vectorization is profitable.
6295 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6296 // the vectorization pipeline.
6297 if (!OrigLoop->empty()) {
6298 // If the user doesn't provide a vectorization factor, determine a
6299 // reasonable one.
6300 if (!UserVF) {
6301 VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6302 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6304 // Make sure we have a VF > 1 for stress testing.
6305 if (VPlanBuildStressTest && VF < 2) {
6306 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6307 << "overriding computed VF.\n");
6308 VF = 4;
6311 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6312 assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6313 LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6314 << " to build VPlans.\n");
6315 buildVPlans(VF, VF);
6317 // For VPlan build stress testing, we bail out after VPlan construction.
6318 if (VPlanBuildStressTest)
6319 return VectorizationFactor::Disabled();
6321 return {VF, 0};
6324 LLVM_DEBUG(
6325 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6326 "VPlan-native path.\n");
6327 return VectorizationFactor::Disabled();
6330 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6331 assert(OrigLoop->empty() && "Inner loop expected.");
6332 Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6333 if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6334 return None;
6336 // Invalidate interleave groups if all blocks of the loop will be predicated.
6337 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6338 !useMaskedInterleavedAccesses(*TTI)) {
6339 LLVM_DEBUG(
6340 dbgs()
6341 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6342 "which requires masked-interleaved support.\n");
6343 CM.InterleaveInfo.reset();
6346 if (UserVF) {
6347 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6348 assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6349 // Collect the instructions (and their associated costs) that will be more
6350 // profitable to scalarize.
6351 CM.selectUserVectorizationFactor(UserVF);
6352 buildVPlansWithVPRecipes(UserVF, UserVF);
6353 LLVM_DEBUG(printPlans(dbgs()));
6354 return {{UserVF, 0}};
6357 unsigned MaxVF = MaybeMaxVF.getValue();
6358 assert(MaxVF != 0 && "MaxVF is zero.");
6360 for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6361 // Collect Uniform and Scalar instructions after vectorization with VF.
6362 CM.collectUniformsAndScalars(VF);
6364 // Collect the instructions (and their associated costs) that will be more
6365 // profitable to scalarize.
6366 if (VF > 1)
6367 CM.collectInstsToScalarize(VF);
6370 buildVPlansWithVPRecipes(1, MaxVF);
6371 LLVM_DEBUG(printPlans(dbgs()));
6372 if (MaxVF == 1)
6373 return VectorizationFactor::Disabled();
6375 // Select the optimal vectorization factor.
6376 return CM.selectVectorizationFactor(MaxVF);
6379 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6380 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6381 << '\n');
6382 BestVF = VF;
6383 BestUF = UF;
6385 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6386 return !Plan->hasVF(VF);
6388 assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6391 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6392 DominatorTree *DT) {
6393 // Perform the actual loop transformation.
6395 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6396 VPCallbackILV CallbackILV(ILV);
6398 VPTransformState State{BestVF, BestUF, LI,
6399 DT, ILV.Builder, ILV.VectorLoopValueMap,
6400 &ILV, CallbackILV};
6401 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6402 State.TripCount = ILV.getOrCreateTripCount(nullptr);
6404 //===------------------------------------------------===//
6406 // Notice: any optimization or new instruction that goes
6407 // into the code below should also be implemented in
6408 // the cost-model.
6410 //===------------------------------------------------===//
6412 // 2. Copy and widen instructions from the old loop into the new loop.
6413 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6414 VPlans.front()->execute(&State);
6416 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6417 // predication, updating analyses.
6418 ILV.fixVectorizedLoop();
6421 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6422 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6423 BasicBlock *Latch = OrigLoop->getLoopLatch();
6425 // We create new control-flow for the vectorized loop, so the original
6426 // condition will be dead after vectorization if it's only used by the
6427 // branch.
6428 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6429 if (Cmp && Cmp->hasOneUse())
6430 DeadInstructions.insert(Cmp);
6432 // We create new "steps" for induction variable updates to which the original
6433 // induction variables map. An original update instruction will be dead if
6434 // all its users except the induction variable are dead.
6435 for (auto &Induction : *Legal->getInductionVars()) {
6436 PHINode *Ind = Induction.first;
6437 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6438 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6439 return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6440 DeadInstructions.end();
6442 DeadInstructions.insert(IndUpdate);
6444 // We record as "Dead" also the type-casting instructions we had identified
6445 // during induction analysis. We don't need any handling for them in the
6446 // vectorized loop because we have proven that, under a proper runtime
6447 // test guarding the vectorized loop, the value of the phi, and the casted
6448 // value of the phi, are the same. The last instruction in this casting chain
6449 // will get its scalar/vector/widened def from the scalar/vector/widened def
6450 // of the respective phi node. Any other casts in the induction def-use chain
6451 // have no other uses outside the phi update chain, and will be ignored.
6452 InductionDescriptor &IndDes = Induction.second;
6453 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6454 DeadInstructions.insert(Casts.begin(), Casts.end());
6458 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6460 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6462 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6463 Instruction::BinaryOps BinOp) {
6464 // When unrolling and the VF is 1, we only need to add a simple scalar.
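 // In other words, unrolled copy number StartIdx of the induction gets
 // Val + StartIdx * Step; e.g. with StartIdx = 3 the integer path below emits
 // a multiply of 3 by Step followed by an add named "induction", while the FP
 // path uses the supplied binary operator and fast-math flags instead.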
6465 Type *Ty = Val->getType();
6466 assert(!Ty->isVectorTy() && "Val must be a scalar");
6468 if (Ty->isFloatingPointTy()) {
6469 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6471 // Floating point operations had to be 'fast' to enable the unrolling.
6472 Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6473 return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6475 Constant *C = ConstantInt::get(Ty, StartIdx);
6476 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6479 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6480 SmallVector<Metadata *, 4> MDs;
6481 // Reserve first location for self reference to the LoopID metadata node.
6482 MDs.push_back(nullptr);
6483 bool IsUnrollMetadata = false;
6484 MDNode *LoopID = L->getLoopID();
6485 if (LoopID) {
6486 // First find existing loop unrolling disable metadata.
6487 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6488 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6489 if (MD) {
6490 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6491 IsUnrollMetadata =
6492 S && S->getString().startswith("llvm.loop.unroll.disable");
6494 MDs.push_back(LoopID->getOperand(i));
6498 if (!IsUnrollMetadata) {
6499 // Add runtime unroll disable metadata.
6500 LLVMContext &Context = L->getHeader()->getContext();
6501 SmallVector<Metadata *, 1> DisableOperands;
6502 DisableOperands.push_back(
6503 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6504 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6505 MDs.push_back(DisableNode);
6506 MDNode *NewLoopID = MDNode::get(Context, MDs);
6507 // Set operand 0 to refer to the loop id itself.
6508 NewLoopID->replaceOperandWith(0, NewLoopID);
6509 L->setLoopID(NewLoopID);
6513 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6514 const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6515 assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6516 bool PredicateAtRangeStart = Predicate(Range.Start);
6518 for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6519 if (Predicate(TmpVF) != PredicateAtRangeStart) {
6520 Range.End = TmpVF;
6521 break;
6524 return PredicateAtRangeStart;
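 // For example, with Range = {2, 16} and a predicate that holds for VF = 2 and
 // VF = 4 but not for VF = 8, the loop above clamps Range.End to 8 and the
 // function returns true (the predicate's value at Range.Start).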
6527 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6528 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6529 /// of VF's starting at a given VF and extending it as much as possible. Each
6530 /// vectorization decision can potentially shorten this sub-range during
6531 /// buildVPlan().
6532 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6533 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6534 VFRange SubRange = {VF, MaxVF + 1};
6535 VPlans.push_back(buildVPlan(SubRange));
6536 VF = SubRange.End;
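 // For example, starting from {1, 9} (MinVF = 1, MaxVF = 8), a decision that
 // changes at VF = 4 clamps the first sub-range to {1, 4}, so one VPlan covers
 // VF 1 and 2 and the next iteration builds a plan starting at VF = 4.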
6540 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6541 VPlanPtr &Plan) {
6542 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6544 // Look for cached value.
6545 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6546 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6547 if (ECEntryIt != EdgeMaskCache.end())
6548 return ECEntryIt->second;
6550 VPValue *SrcMask = createBlockInMask(Src, Plan);
6552 // The terminator has to be a branch inst!
6553 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6554 assert(BI && "Unexpected terminator found");
6556 if (!BI->isConditional())
6557 return EdgeMaskCache[Edge] = SrcMask;
6559 VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6560 assert(EdgeMask && "No Edge Mask found for condition");
6562 if (BI->getSuccessor(0) != Dst)
6563 EdgeMask = Builder.createNot(EdgeMask);
6565 if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6566 EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6568 return EdgeMaskCache[Edge] = EdgeMask;
6571 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6572 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6574 // Look for cached value.
6575 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6576 if (BCEntryIt != BlockMaskCache.end())
6577 return BCEntryIt->second;
6579 // All-one mask is modelled as no-mask following the convention for masked
6580 // load/store/gather/scatter. Initialize BlockMask to no-mask.
6581 VPValue *BlockMask = nullptr;
6583 if (OrigLoop->getHeader() == BB) {
6584 if (!CM.blockNeedsPredication(BB))
6585 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6587 // Introduce the early-exit compare IV <= BTC to form header block mask.
6588 // This is used instead of IV < TC because TC may wrap, unlike BTC.
6589 VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6590 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6591 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6592 return BlockMaskCache[BB] = BlockMask;
6595 // This is the block mask. We OR all incoming edges.
6596 for (auto *Predecessor : predecessors(BB)) {
6597 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6598 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6599 return BlockMaskCache[BB] = EdgeMask;
6601 if (!BlockMask) { // BlockMask has its initialized nullptr value.
6602 BlockMask = EdgeMask;
6603 continue;
6606 BlockMask = Builder.createOr(BlockMask, EdgeMask);
6609 return BlockMaskCache[BB] = BlockMask;
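 // For example, when folding the tail of a single-block loop, the header mask
 // above is just the ICmpULE(IV, BTC) comparison. For an if-converted diamond,
 // the masked block's mask is the OR of its incoming edge masks, where each
 // edge mask is the branch condition (negated for the false successor) ANDed
 // with the source block's mask.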
6612 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6613 VFRange &Range,
6614 VPlanPtr &Plan) {
6615 const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6616 if (!IG)
6617 return nullptr;
6619 // Now check if IG is relevant for VF's in the given range.
6620 auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6621 return [=](unsigned VF) -> bool {
6622 return (VF >= 2 && // Query is illegal for VF == 1
6623 CM.getWideningDecision(I, VF) ==
6624 LoopVectorizationCostModel::CM_Interleave);
6627 if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6628 return nullptr;
6630 // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6631 // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
6632 // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6633 assert(I == IG->getInsertPos() &&
6634 "Generating a recipe for an adjunct member of an interleave group");
6636 VPValue *Mask = nullptr;
6637 if (Legal->isMaskRequired(I))
6638 Mask = createBlockInMask(I->getParent(), Plan);
6640 return new VPInterleaveRecipe(IG, Mask);
6643 VPWidenMemoryInstructionRecipe *
6644 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6645 VPlanPtr &Plan) {
6646 if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6647 return nullptr;
6649 auto willWiden = [&](unsigned VF) -> bool {
6650 if (VF == 1)
6651 return false;
6652 if (CM.isScalarAfterVectorization(I, VF) ||
6653 CM.isProfitableToScalarize(I, VF))
6654 return false;
6655 LoopVectorizationCostModel::InstWidening Decision =
6656 CM.getWideningDecision(I, VF);
6657 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6658 "CM decision should be taken at this point.");
6659 assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6660 "Interleave memory opportunity should be caught earlier.");
6661 return Decision != LoopVectorizationCostModel::CM_Scalarize;
6664 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6665 return nullptr;
6667 VPValue *Mask = nullptr;
6668 if (Legal->isMaskRequired(I))
6669 Mask = createBlockInMask(I->getParent(), Plan);
6671 return new VPWidenMemoryInstructionRecipe(*I, Mask);
6674 VPWidenIntOrFpInductionRecipe *
6675 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6676 if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6677 // Check if this is an integer or fp induction. If so, build the recipe that
6678 // produces its scalar and vector values.
6679 InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6680 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6681 II.getKind() == InductionDescriptor::IK_FpInduction)
6682 return new VPWidenIntOrFpInductionRecipe(Phi);
6684 return nullptr;
6687 // Optimize the special case where the source is a constant integer
6688 // induction variable. Notice that we can only optimize the 'trunc' case
6689 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6690 // (c) other casts depend on pointer size.
6692 // Determine whether \p K is a truncation based on an induction variable that
6693 // can be optimized.
6694 auto isOptimizableIVTruncate =
6695 [&](Instruction *K) -> std::function<bool(unsigned)> {
6696 return
6697 [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6700 if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6701 isOptimizableIVTruncate(I), Range))
6702 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6703 cast<TruncInst>(I));
6704 return nullptr;
6707 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6708 PHINode *Phi = dyn_cast<PHINode>(I);
6709 if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6710 return nullptr;
6712 // We know that all PHIs in non-header blocks are converted into selects, so
6713 // we don't have to worry about the insertion order and we can just use the
6714 // builder. At this point we generate the predication tree. There may be
6715 // duplications since this is a simple recursive scan, but future
6716 // optimizations will clean it up.
6718 SmallVector<VPValue *, 2> Masks;
6719 unsigned NumIncoming = Phi->getNumIncomingValues();
6720 for (unsigned In = 0; In < NumIncoming; In++) {
6721 VPValue *EdgeMask =
6722 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6723 assert((EdgeMask || NumIncoming == 1) &&
6724 "Multiple predecessors with one having a full mask");
6725 if (EdgeMask)
6726 Masks.push_back(EdgeMask);
6728 return new VPBlendRecipe(Phi, Masks);
6731 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6732 VFRange &Range) {
6734 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6735 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6737 if (IsPredicated)
6738 return false;
6740 auto IsVectorizableOpcode = [](unsigned Opcode) {
6741 switch (Opcode) {
6742 case Instruction::Add:
6743 case Instruction::And:
6744 case Instruction::AShr:
6745 case Instruction::BitCast:
6746 case Instruction::Br:
6747 case Instruction::Call:
6748 case Instruction::FAdd:
6749 case Instruction::FCmp:
6750 case Instruction::FDiv:
6751 case Instruction::FMul:
6752 case Instruction::FNeg:
6753 case Instruction::FPExt:
6754 case Instruction::FPToSI:
6755 case Instruction::FPToUI:
6756 case Instruction::FPTrunc:
6757 case Instruction::FRem:
6758 case Instruction::FSub:
6759 case Instruction::GetElementPtr:
6760 case Instruction::ICmp:
6761 case Instruction::IntToPtr:
6762 case Instruction::Load:
6763 case Instruction::LShr:
6764 case Instruction::Mul:
6765 case Instruction::Or:
6766 case Instruction::PHI:
6767 case Instruction::PtrToInt:
6768 case Instruction::SDiv:
6769 case Instruction::Select:
6770 case Instruction::SExt:
6771 case Instruction::Shl:
6772 case Instruction::SIToFP:
6773 case Instruction::SRem:
6774 case Instruction::Store:
6775 case Instruction::Sub:
6776 case Instruction::Trunc:
6777 case Instruction::UDiv:
6778 case Instruction::UIToFP:
6779 case Instruction::URem:
6780 case Instruction::Xor:
6781 case Instruction::ZExt:
6782 return true;
6784 return false;
6787 if (!IsVectorizableOpcode(I->getOpcode()))
6788 return false;
6790 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6791 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6792 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6793 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6794 return false;
6797 auto willWiden = [&](unsigned VF) -> bool {
6798 if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6799 CM.isProfitableToScalarize(I, VF)))
6800 return false;
6801 if (CallInst *CI = dyn_cast<CallInst>(I)) {
6802 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6803 // The following case may be scalarized depending on the VF.
6804 // The flag shows whether we use an Intrinsic or a regular Call for the
6805 // vectorized version of the instruction.
6806 // Is it beneficial to perform the intrinsic call compared to a lib call?
6807 bool NeedToScalarize;
6808 unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6809 bool UseVectorIntrinsic =
6810 ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6811 return UseVectorIntrinsic || !NeedToScalarize;
6813 if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6814 assert(CM.getWideningDecision(I, VF) ==
6815 LoopVectorizationCostModel::CM_Scalarize &&
6816 "Memory widening decisions should have been taken care by now");
6817 return false;
6819 return true;
6822 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6823 return false;
6825 // Success: widen this instruction. We optimize the common case where
6826 // consecutive instructions can be represented by a single recipe.
6827 if (!VPBB->empty()) {
6828 VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6829 if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6830 return true;
6833 VPBB->appendRecipe(new VPWidenRecipe(I));
6834 return true;
6837 VPBasicBlock *VPRecipeBuilder::handleReplication(
6838 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6839 DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6840 VPlanPtr &Plan) {
6841 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6842 [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6843 Range);
6845 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6846 [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6848 auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6850 // Find if I uses a predicated instruction. If so, it will use its scalar
6851 // value. Avoid hoisting the insert-element which packs the scalar value into
6852 // a vector value, as that happens iff all users use the vector value.
6853 for (auto &Op : I->operands())
6854 if (auto *PredInst = dyn_cast<Instruction>(Op))
6855 if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6856 PredInst2Recipe[PredInst]->setAlsoPack(false);
6858 // Finalize the recipe for Instr, first if it is not predicated.
6859 if (!IsPredicated) {
6860 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6861 VPBB->appendRecipe(Recipe);
6862 return VPBB;
6864 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6865 assert(VPBB->getSuccessors().empty() &&
6866 "VPBB has successors when handling predicated replication.");
6867 // Record predicated instructions for above packing optimizations.
6868 PredInst2Recipe[I] = Recipe;
6869 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6870 VPBlockUtils::insertBlockAfter(Region, VPBB);
6871 auto *RegSucc = new VPBasicBlock();
6872 VPBlockUtils::insertBlockAfter(RegSucc, Region);
6873 return RegSucc;
6876 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6877 VPRecipeBase *PredRecipe,
6878 VPlanPtr &Plan) {
6879 // Instructions marked for predication are replicated and placed under an
6880 // if-then construct to prevent side-effects.
6882 // Generate recipes to compute the block mask for this region.
6883 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6885 // Build the triangular if-then region.
6886 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6887 assert(Instr->getParent() && "Predicated instruction not in any basic block");
6888 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6889 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6890 auto *PHIRecipe =
6891 Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6892 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6893 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6894 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6896 // Note: first set Entry as region entry and then connect successors starting
6897 // from it in order, to propagate the "parent" of each VPBasicBlock.
6898 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6899 VPBlockUtils::connectBlocks(Pred, Exit);
6901 return Region;
6904 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6905 VPlanPtr &Plan, VPBasicBlock *VPBB) {
6906 VPRecipeBase *Recipe = nullptr;
6907 // Check if Instr should belong to an interleave memory recipe, or already
6908 // does. In the latter case Instr is irrelevant.
6909 if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6910 VPBB->appendRecipe(Recipe);
6911 return true;
6914 // Check if Instr is a memory operation that should be widened.
6915 if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6916 VPBB->appendRecipe(Recipe);
6917 return true;
6920 // Check if Instr should form some PHI recipe.
6921 if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6922 VPBB->appendRecipe(Recipe);
6923 return true;
6925 if ((Recipe = tryToBlend(Instr, Plan))) {
6926 VPBB->appendRecipe(Recipe);
6927 return true;
6929 if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6930 VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6931 return true;
6934 // Check if Instr is to be widened by a general VPWidenRecipe, after
6935 // having first checked for specific widening recipes that deal with
6936 // Interleave Groups, Inductions and Phi nodes.
6937 if (tryToWiden(Instr, VPBB, Range))
6938 return true;
6940 return false;
6943 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6944 unsigned MaxVF) {
6945 assert(OrigLoop->empty() && "Inner loop expected.");
6947 // Collect conditions feeding internal conditional branches; they need to be
6948 // represented in VPlan for it to model masking.
6949 SmallPtrSet<Value *, 1> NeedDef;
6951 auto *Latch = OrigLoop->getLoopLatch();
6952 for (BasicBlock *BB : OrigLoop->blocks()) {
6953 if (BB == Latch)
6954 continue;
6955 BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6956 if (Branch && Branch->isConditional())
6957 NeedDef.insert(Branch->getCondition());
6960 // If the tail is to be folded by masking, the primary induction variable
6961 // needs to be represented in VPlan for it to model early-exit masking.
6962 // Also, both the Phi and the live-out instruction of each reduction are
6963 // required in order to introduce a select between them in VPlan.
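// (Illustrative sketch: with tail folding the header mask is conceptually an
// icmp ule of the widened primary induction against the backedge-taken count,
// so the induction needs a VPValue; likewise each reduction's phi and loop-exit
// value need VPValues so the select created at the end of VPlan construction
// can refer to them.)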
6964 if (CM.foldTailByMasking()) {
6965 NeedDef.insert(Legal->getPrimaryInduction());
6966 for (auto &Reduction : *Legal->getReductionVars()) {
6967 NeedDef.insert(Reduction.first);
6968 NeedDef.insert(Reduction.second.getLoopExitInstr());
6972 // Collect instructions from the original loop that will become trivially dead
6973 // in the vectorized loop. We don't need to vectorize these instructions. For
6974 // example, original induction update instructions can become dead because we
6975 // separately emit induction "steps" when generating code for the new loop.
6976 // Similarly, we create a new latch condition when setting up the structure
6977 // of the new loop, so the old one can become dead.
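// (Illustrative sketch, with hypothetical IR: in a loop containing
//   %iv.next = add i64 %iv, 1
//   %cond = icmp eq i64 %iv.next, %n
// both instructions can end up in DeadInstructions, since the vector loop gets
// its own induction update and latch compare.)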
6978 SmallPtrSet<Instruction *, 4> DeadInstructions;
6979 collectTriviallyDeadInstructions(DeadInstructions);
6981 for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6982 VFRange SubRange = {VF, MaxVF + 1};
6983 VPlans.push_back(
6984 buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6985 VF = SubRange.End;
6989 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6990 VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6991 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6992 // Hold a mapping from predicated instructions to their recipes, in order to
6993 // fix their AlsoPack behavior if a user is determined to replicate and use a
6994 // scalar instead of vector value.
6995 DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6997 DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6998 DenseMap<Instruction *, Instruction *> SinkAfterInverse;
7000 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7001 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7002 auto Plan = std::make_unique<VPlan>(VPBB);
7004 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7005 // Represent values that will have defs inside VPlan.
7006 for (Value *V : NeedDef)
7007 Plan->addVPValue(V);
7009 // Scan the body of the loop in a topological order to visit each basic block
7010 // after having visited its predecessor basic blocks.
7011 LoopBlocksDFS DFS(OrigLoop);
7012 DFS.perform(LI);
7014 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7015 // Relevant instructions from basic block BB will be grouped into VPRecipe
7016 // ingredients and fill a new VPBasicBlock.
7017 unsigned VPBBsForBB = 0;
7018 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7019 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7020 VPBB = FirstVPBBForBB;
7021 Builder.setInsertPoint(VPBB);
7023 std::vector<Instruction *> Ingredients;
7025 // Organize the ingredients to vectorize from current basic block in the
7026 // right order.
7027 for (Instruction &I : BB->instructionsWithoutDebug()) {
7028 Instruction *Instr = &I;
7030 // First filter out irrelevant instructions, to ensure no recipes are
7031 // built for them.
7032 if (isa<BranchInst>(Instr) ||
7033 DeadInstructions.find(Instr) != DeadInstructions.end())
7034 continue;
7036 // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
7037 // member of the IG, do not construct any Recipe for it.
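// (Illustrative sketch, with a hypothetical interleave group of factor 2 over
// loads %x (the insert position) and %y: only %x receives a recipe, the
// VPInterleaveRecipe; %y is an adjunct member and is skipped here, except that
// an instruction recorded to be sunk after %y is pushed in its place.)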
7038 const InterleaveGroup<Instruction> *IG =
7039 CM.getInterleavedAccessGroup(Instr);
7040 if (IG && Instr != IG->getInsertPos() &&
7041 Range.Start >= 2 && // Query is illegal for VF == 1
7042 CM.getWideningDecision(Instr, Range.Start) ==
7043 LoopVectorizationCostModel::CM_Interleave) {
7044 auto SinkCandidate = SinkAfterInverse.find(Instr);
7045 if (SinkCandidate != SinkAfterInverse.end())
7046 Ingredients.push_back(SinkCandidate->second);
7047 continue;
7050 // Move instructions to handle first-order recurrences, step 1: avoid
7051 // handling this instruction until after we've handled the instruction it
7052 // should follow.
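// (Illustrative sketch, with hypothetical instructions: if SinkAfter maps
// %a -> %b, meaning "sink %a after %b", then %a is skipped when first visited
// and SinkAfterInverse[%b] = %a is recorded; step 2 below appends %a to
// Ingredients right after %b, so %a's recipe ends up after %b's.)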
7053 auto SAIt = SinkAfter.find(Instr);
7054 if (SAIt != SinkAfter.end()) {
7055 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7056 << *SAIt->second
7057 << " to vectorize a 1st order recurrence.\n");
7058 SinkAfterInverse[SAIt->second] = Instr;
7059 continue;
7062 Ingredients.push_back(Instr);
7064 // Move instructions to handle first-order recurrences, step 2: push the
7065 // instruction to be sunk at its insertion point.
7066 auto SAInvIt = SinkAfterInverse.find(Instr);
7067 if (SAInvIt != SinkAfterInverse.end())
7068 Ingredients.push_back(SAInvIt->second);
7071 // Introduce each ingredient into VPlan.
7072 for (Instruction *Instr : Ingredients) {
7073 if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7074 continue;
7076 // Otherwise, if all widening options failed, Instruction is to be
7077 // replicated. This may create a successor for VPBB.
7078 VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7079 Instr, Range, VPBB, PredInst2Recipe, Plan);
7080 if (NextVPBB != VPBB) {
7081 VPBB = NextVPBB;
7082 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7083 : "");
7088 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7089 // may also be empty, such as the last one (VPBB), reflecting original
7090 // basic-blocks with no recipes.
7091 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7092 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7093 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7094 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7095 delete PreEntry;
7097 // Finally, if tail is folded by masking, introduce selects between the phi
7098 // and the live-out instruction of each reduction, at the end of the latch.
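// (Illustrative sketch, for a hypothetical sum reduction with phi %red.phi and
// loop-exit value %red.next, the recipe created below computes
//   select <header mask>, %red.next, %red.phi
// so lanes masked off by tail folding keep the value carried by the phi.)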
7099 if (CM.foldTailByMasking()) {
7100 Builder.setInsertPoint(VPBB);
7101 auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7102 for (auto &Reduction : *Legal->getReductionVars()) {
7103 VPValue *Phi = Plan->getVPValue(Reduction.first);
7104 VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7105 Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7109 std::string PlanName;
7110 raw_string_ostream RSO(PlanName);
7111 unsigned VF = Range.Start;
7112 Plan->addVF(VF);
7113 RSO << "Initial VPlan for VF={" << VF;
7114 for (VF *= 2; VF < Range.End; VF *= 2) {
7115 Plan->addVF(VF);
7116 RSO << "," << VF;
7118 RSO << "},UF>=1";
7119 RSO.flush();
7120 Plan->setName(PlanName);
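// (Illustrative sketch, with hypothetical bounds: a Range of [4, 16) yields
// the name "Initial VPlan for VF={4,8},UF>=1".)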
7122 return Plan;
7125 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7126 // Outer loop handling: They may require CFG and instruction level
7127 // transformations before even evaluating whether vectorization is profitable.
7128 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7129 // the vectorization pipeline.
7130 assert(!OrigLoop->empty());
7131 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7133 // Create new empty VPlan
7134 auto Plan = std::make_unique<VPlan>();
7136 // Build hierarchical CFG
7137 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7138 HCFGBuilder.buildHierarchicalCFG();
7140 for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7141 Plan->addVF(VF);
7143 if (EnableVPlanPredication) {
7144 VPlanPredicator VPP(*Plan);
7145 VPP.predicate();
7147 // Avoid running transformation to recipes until masked code generation in
7148 // VPlan-native path is in place.
7149 return Plan;
7152 SmallPtrSet<Instruction *, 1> DeadInstructions;
7153 VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7154 Plan, Legal->getInductionVars(), DeadInstructions);
7156 return Plan;
7159 Value* LoopVectorizationPlanner::VPCallbackILV::
7160 getOrCreateVectorValues(Value *V, unsigned Part) {
7161 return ILV.getOrCreateVectorValue(V, Part);
7164 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7165 O << " +\n"
7166 << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7167 IG->getInsertPos()->printAsOperand(O, false);
7168 if (User) {
7169 O << ", ";
7170 User->getOperand(0)->printAsOperand(O);
7172 O << "\\l\"";
7173 for (unsigned i = 0; i < IG->getFactor(); ++i)
7174 if (Instruction *I = IG->getMember(i))
7175 O << " +\n"
7176 << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
7179 void VPWidenRecipe::execute(VPTransformState &State) {
7180 for (auto &Instr : make_range(Begin, End))
7181 State.ILV->widenInstruction(Instr);
7184 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7185 assert(!State.Instance && "Int or FP induction being replicated.");
7186 State.ILV->widenIntOrFpInduction(IV, Trunc);
7189 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7190 State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7193 void VPBlendRecipe::execute(VPTransformState &State) {
7194 State.ILV->setDebugLocFromInst(State.Builder, Phi);
7195 // We know that all PHIs in non-header blocks are converted into
7196 // selects, so we don't have to worry about the insertion order and we
7197 // can just use the builder.
7198 // At this point we generate the predication tree. There may be
7199 // duplications since this is a simple recursive scan, but future
7200 // optimizations will clean it up.
7202 unsigned NumIncoming = Phi->getNumIncomingValues();
7204 assert((User || NumIncoming == 1) &&
7205 "Multiple predecessors with predecessors having a full mask");
7206 // Generate a sequence of selects of the form:
7207 // SELECT(Mask3, In3,
7208 // SELECT(Mask2, In2,
7209 // ( ...)))
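// (Illustrative sketch, with hypothetical values: for three incoming values
// %v1/%v2/%v3 with edge masks %m1/%m2/%m3 and UF=1, the loop below emits
//   %s = %v1                      ; the first value seeds the chain, %m1 unused
//   %s = select %m2, %v2, %s
//   %predphi = select %m3, %v3, %s
// per unroll part.)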
7210 InnerLoopVectorizer::VectorParts Entry(State.UF);
7211 for (unsigned In = 0; In < NumIncoming; ++In) {
7212 for (unsigned Part = 0; Part < State.UF; ++Part) {
7213 // We might have single edge PHIs (blocks) - use an identity
7214 // 'select' for the first PHI operand.
7215 Value *In0 =
7216 State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7217 if (In == 0)
7218 Entry[Part] = In0; // Initialize with the first incoming value.
7219 else {
7220 // Select between the current value and the previous incoming edge
7221 // based on the incoming mask.
7222 Value *Cond = State.get(User->getOperand(In), Part);
7223 Entry[Part] =
7224 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7228 for (unsigned Part = 0; Part < State.UF; ++Part)
7229 State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7232 void VPInterleaveRecipe::execute(VPTransformState &State) {
7233 assert(!State.Instance && "Interleave group being replicated.");
7234 if (!User)
7235 return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7237 // Last (and currently only) operand is a mask.
7238 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7239 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7240 for (unsigned Part = 0; Part < State.UF; ++Part)
7241 MaskValues[Part] = State.get(Mask, Part);
7242 State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7245 void VPReplicateRecipe::execute(VPTransformState &State) {
7246 if (State.Instance) { // Generate a single instance.
7247 State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7248 // Insert scalar instance packing it into a vector.
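// (Illustrative sketch, with a hypothetical i32 ingredient and VF=4: lane L of
// part P is packed as
//   %vec.P = insertelement <4 x i32> <prev-or-undef>, i32 %scalar.P.L, i32 L
// starting from undef at lane 0 and threading the growing vector through the
// remaining lanes.)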
7249 if (AlsoPack && State.VF > 1) {
7250 // If we're constructing lane 0, initialize to start from undef.
7251 if (State.Instance->Lane == 0) {
7252 Value *Undef =
7253 UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7254 State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7256 State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7258 return;
7261 // Generate scalar instances for all VF lanes of all UF parts, unless the
7262 // instruction is uniform, in which case generate only the first lane for each
7263 // of the UF parts.
7264 unsigned EndLane = IsUniform ? 1 : State.VF;
7265 for (unsigned Part = 0; Part < State.UF; ++Part)
7266 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7267 State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7270 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7271 assert(State.Instance && "Branch on Mask works only on single instance.");
7273 unsigned Part = State.Instance->Part;
7274 unsigned Lane = State.Instance->Lane;
7276 Value *ConditionBit = nullptr;
7277 if (!User) // Block in mask is all-one.
7278 ConditionBit = State.Builder.getTrue();
7279 else {
7280 VPValue *BlockInMask = User->getOperand(0);
7281 ConditionBit = State.get(BlockInMask, Part);
7282 if (ConditionBit->getType()->isVectorTy())
7283 ConditionBit = State.Builder.CreateExtractElement(
7284 ConditionBit, State.Builder.getInt32(Lane));
7287 // Replace the temporary unreachable terminator with a new conditional branch,
7288 // whose two destinations will be set later when they are created.
7289 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7290 assert(isa<UnreachableInst>(CurrentTerminator) &&
7291 "Expected to replace unreachable terminator with conditional branch.");
7292 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7293 CondBr->setSuccessor(0, nullptr);
7294 ReplaceInstWithInst(CurrentTerminator, CondBr);
7297 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7298 assert(State.Instance && "Predicated instruction PHI works per instance.");
7299 Instruction *ScalarPredInst = cast<Instruction>(
7300 State.ValueMap.getScalarValue(PredInst, *State.Instance));
7301 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7302 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7303 assert(PredicatingBB && "Predicated block has no single predecessor.");
7305 // By current pack/unpack logic we need to generate only a single phi node: if
7306 // a vector value for the predicated instruction exists at this point it means
7307 // the instruction has vector users only, and a phi for the vector value is
7308 // needed. In this case the recipe of the predicated instruction is marked to
7309 // also do that packing, thereby "hoisting" the insert-element sequence.
7310 // Otherwise, a phi node for the scalar value is needed.
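// (Illustrative sketch, with hypothetical block names pred.udiv.entry /
// pred.udiv.if / pred.udiv.continue: when a vector value exists it is the
// insertelement emitted in the "if" block, and the phi built below merges it
// with the still-unmodified vector flowing around the predicated block, e.g.
//   %r = phi <4 x i32> [ %unmodified, %pred.udiv.entry ],
//                      [ %inserted,   %pred.udiv.if ].)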
7311 unsigned Part = State.Instance->Part;
7312 if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7313 Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7314 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7315 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7316 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7317 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7318 State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7319 } else {
7320 Type *PredInstType = PredInst->getType();
7321 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7322 Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7323 Phi->addIncoming(ScalarPredInst, PredicatedBB);
7324 State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7328 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7329 if (!User)
7330 return State.ILV->vectorizeMemoryInstruction(&Instr);
7332 // Last (and currently only) operand is a mask.
7333 InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7334 VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7335 for (unsigned Part = 0; Part < State.UF; ++Part)
7336 MaskValues[Part] = State.get(Mask, Part);
7337 State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7340 static ScalarEpilogueLowering
7341 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7342 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7343 ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7344 if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7345 (F->hasOptSize() ||
7346 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7347 SEL = CM_ScalarEpilogueNotAllowedOptSize;
7348 else if (PreferPredicateOverEpilog || Hints.getPredicate())
7349 SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7351 return SEL;
7354 // Process the loop in the VPlan-native vectorization path. This path builds
7355 // VPlan upfront in the vectorization pipeline, which makes it possible to apply
7356 // VPlan-to-VPlan transformations from the very beginning without modifying the
7357 // input LLVM IR.
7358 static bool processLoopInVPlanNativePath(
7359 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7360 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7361 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7362 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7363 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7365 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7366 Function *F = L->getHeader()->getParent();
7367 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7368 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7370 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7371 &Hints, IAI);
7372 // Use the planner for outer loop vectorization.
7373 // TODO: CM is not used at this point inside the planner. Turn CM into an
7374 // optional argument if we don't need it in the future.
7375 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7377 // Get user vectorization factor.
7378 const unsigned UserVF = Hints.getWidth();
7380 // Plan how to best vectorize, return the best VF and its cost.
7381 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7383 // If we are stress testing VPlan builds, do not attempt to generate vector
7384 // code. Masked vector code generation support will follow soon.
7385 // Also, do not attempt to vectorize if no vector code will be produced.
7386 if (VPlanBuildStressTest || EnableVPlanPredication ||
7387 VectorizationFactor::Disabled() == VF)
7388 return false;
7390 LVP.setBestPlan(VF.Width, 1);
7392 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7393 &CM);
7394 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7395 << L->getHeader()->getParent()->getName() << "\"\n");
7396 LVP.executePlan(LB, DT);
7398 // Mark the loop as already vectorized to avoid vectorizing again.
7399 Hints.setAlreadyVectorized();
7401 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7402 return true;
7405 bool LoopVectorizePass::processLoop(Loop *L) {
7406 assert((EnableVPlanNativePath || L->empty()) &&
7407 "VPlan-native path is not enabled. Only process inner loops.");
7409 #ifndef NDEBUG
7410 const std::string DebugLocStr = getDebugLocString(L);
7411 #endif /* NDEBUG */
7413 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7414 << L->getHeader()->getParent()->getName() << "\" from "
7415 << DebugLocStr << "\n");
7417 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7419 LLVM_DEBUG(
7420 dbgs() << "LV: Loop hints:"
7421 << " force="
7422 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7423 ? "disabled"
7424 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7425 ? "enabled"
7426 : "?"))
7427 << " width=" << Hints.getWidth()
7428 << " unroll=" << Hints.getInterleave() << "\n");
7430 // Function containing loop
7431 Function *F = L->getHeader()->getParent();
7433 // Looking at the diagnostic output is the only way to determine if a loop
7434 // was vectorized (other than looking at the IR or machine code), so it
7435 // is important to generate an optimization remark for each loop. Most of
7436 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7437 // generated as OptimizationRemark and OptimizationRemarkMissed are less
7438 // verbose; they report, respectively, vectorized loops and unvectorized
7439 // loops that may benefit from vectorization.
7441 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7442 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7443 return false;
7446 PredicatedScalarEvolution PSE(*SE, *L);
7448 // Check if it is legal to vectorize the loop.
7449 LoopVectorizationRequirements Requirements(*ORE);
7450 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7451 &Requirements, &Hints, DB, AC);
7452 if (!LVL.canVectorize(EnableVPlanNativePath)) {
7453 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7454 Hints.emitRemarkWithHints();
7455 return false;
7458 // Check the function attributes and profiles to find out if this function
7459 // should be optimized for size.
7460 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7462 // Entrance to the VPlan-native vectorization path. Outer loops are processed
7463 // here. They may require CFG and instruction level transformations before
7464 // even evaluating whether vectorization is profitable. Since we cannot modify
7465 // the incoming IR, we need to build VPlan upfront in the vectorization
7466 // pipeline.
7467 if (!L->empty())
7468 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7469 ORE, BFI, PSI, Hints);
7471 assert(L->empty() && "Inner loop expected.");
7472 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7473 // count by optimizing for size, to minimize overheads.
7474 // Prefer a constant trip count, then profile data, then the upper-bound estimate.
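// (Illustrative sketch, with hypothetical numbers: a loop known to run 3
// iterations, below a TinyTripCountVectorThreshold of, say, 16, has its
// scalar-epilogue lowering switched below to
// CM_ScalarEpilogueNotAllowedLowTripLoop, unless vectorization was explicitly
// forced.)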
7475 unsigned ExpectedTC = 0;
7476 bool HasExpectedTC = false;
7477 if (const SCEVConstant *ConstExits =
7478 dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7479 const APInt &ExitsCount = ConstExits->getAPInt();
7480 // We are interested in small values for ExpectedTC. Skip over those that
7481 // can't fit in an unsigned.
7482 if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7483 ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7484 HasExpectedTC = true;
7487 // ExpectedTC may be large because it's bound by a variable. Check
7488 // profiling information to validate we should vectorize.
7489 if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7490 auto EstimatedTC = getLoopEstimatedTripCount(L);
7491 if (EstimatedTC) {
7492 ExpectedTC = *EstimatedTC;
7493 HasExpectedTC = true;
7496 if (!HasExpectedTC) {
7497 ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7498 HasExpectedTC = (ExpectedTC > 0);
7501 if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7502 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7503 << "This loop is worth vectorizing only if no scalar "
7504 << "iteration overheads are incurred.");
7505 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7506 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7507 else {
7508 LLVM_DEBUG(dbgs() << "\n");
7509 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7513 // Check the function attributes to see if implicit floats are allowed.
7514 // FIXME: This check doesn't seem possibly correct -- what if the loop is
7515 // an integer loop and the vector instructions selected are purely integer
7516 // vector instructions?
7517 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7518 reportVectorizationFailure(
7519 "Can't vectorize when the NoImplicitFloat attribute is used",
7520 "loop not vectorized due to NoImplicitFloat attribute",
7521 "NoImplicitFloat", ORE, L);
7522 Hints.emitRemarkWithHints();
7523 return false;
7526 // Check if the target supports potentially unsafe FP vectorization.
7527 // FIXME: Add a check for the type of safety issue (denormal, signaling)
7528 // for the target we're vectorizing for, to make sure none of the
7529 // additional fp-math flags can help.
7530 if (Hints.isPotentiallyUnsafe() &&
7531 TTI->isFPVectorizationPotentiallyUnsafe()) {
7532 reportVectorizationFailure(
7533 "Potentially unsafe FP op prevents vectorization",
7534 "loop not vectorized due to unsafe FP support.",
7535 "UnsafeFP", ORE, L);
7536 Hints.emitRemarkWithHints();
7537 return false;
7540 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7541 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7543 // If an override option has been passed in for interleaved accesses, use it.
7544 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7545 UseInterleaved = EnableInterleavedMemAccesses;
7547 // Analyze interleaved memory accesses.
7548 if (UseInterleaved) {
7549 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7552 // Use the cost model.
7553 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7554 F, &Hints, IAI);
7555 CM.collectValuesToIgnore();
7557 // Use the planner for vectorization.
7558 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7560 // Get user vectorization factor.
7561 unsigned UserVF = Hints.getWidth();
7563 // Plan how to best vectorize, return the best VF and its cost.
7564 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7566 VectorizationFactor VF = VectorizationFactor::Disabled();
7567 unsigned IC = 1;
7568 unsigned UserIC = Hints.getInterleave();
7570 if (MaybeVF) {
7571 VF = *MaybeVF;
7572 // Select the interleave count.
7573 IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7576 // Identify the diagnostic messages that should be produced.
7577 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7578 bool VectorizeLoop = true, InterleaveLoop = true;
7579 if (Requirements.doesNotMeet(F, L, Hints)) {
7580 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7581 "requirements.\n");
7582 Hints.emitRemarkWithHints();
7583 return false;
7586 if (VF.Width == 1) {
7587 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7588 VecDiagMsg = std::make_pair(
7589 "VectorizationNotBeneficial",
7590 "the cost-model indicates that vectorization is not beneficial");
7591 VectorizeLoop = false;
7594 if (!MaybeVF && UserIC > 1) {
7595 // Tell the user interleaving was avoided up-front, despite being explicitly
7596 // requested.
7597 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7598 "interleaving should be avoided up front\n");
7599 IntDiagMsg = std::make_pair(
7600 "InterleavingAvoided",
7601 "Ignoring UserIC, because interleaving was avoided up front");
7602 InterleaveLoop = false;
7603 } else if (IC == 1 && UserIC <= 1) {
7604 // Tell the user interleaving is not beneficial.
7605 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7606 IntDiagMsg = std::make_pair(
7607 "InterleavingNotBeneficial",
7608 "the cost-model indicates that interleaving is not beneficial");
7609 InterleaveLoop = false;
7610 if (UserIC == 1) {
7611 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7612 IntDiagMsg.second +=
7613 " and is explicitly disabled or interleave count is set to 1";
7615 } else if (IC > 1 && UserIC == 1) {
7616 // Tell the user interleaving is beneficial, but it is explicitly disabled.
7617 LLVM_DEBUG(
7618 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7619 IntDiagMsg = std::make_pair(
7620 "InterleavingBeneficialButDisabled",
7621 "the cost-model indicates that interleaving is beneficial "
7622 "but is explicitly disabled or interleave count is set to 1");
7623 InterleaveLoop = false;
7626 // Override IC if user provided an interleave count.
7627 IC = UserIC > 0 ? UserIC : IC;
7629 // Emit diagnostic messages, if any.
7630 const char *VAPassName = Hints.vectorizeAnalysisPassName();
7631 if (!VectorizeLoop && !InterleaveLoop) {
7632 // Do not vectorize or interleave the loop.
7633 ORE->emit([&]() {
7634 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7635 L->getStartLoc(), L->getHeader())
7636 << VecDiagMsg.second;
7638 ORE->emit([&]() {
7639 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7640 L->getStartLoc(), L->getHeader())
7641 << IntDiagMsg.second;
7643 return false;
7644 } else if (!VectorizeLoop && InterleaveLoop) {
7645 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7646 ORE->emit([&]() {
7647 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7648 L->getStartLoc(), L->getHeader())
7649 << VecDiagMsg.second;
7651 } else if (VectorizeLoop && !InterleaveLoop) {
7652 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7653 << ") in " << DebugLocStr << '\n');
7654 ORE->emit([&]() {
7655 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7656 L->getStartLoc(), L->getHeader())
7657 << IntDiagMsg.second;
7659 } else if (VectorizeLoop && InterleaveLoop) {
7660 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7661 << ") in " << DebugLocStr << '\n');
7662 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7665 LVP.setBestPlan(VF.Width, IC);
7667 using namespace ore;
7668 bool DisableRuntimeUnroll = false;
7669 MDNode *OrigLoopID = L->getLoopID();
7671 if (!VectorizeLoop) {
7672 assert(IC > 1 && "interleave count should not be 1 or 0");
7673 // If we decided that it is not legal to vectorize the loop, then
7674 // interleave it.
7675 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7676 &CM);
7677 LVP.executePlan(Unroller, DT);
7679 ORE->emit([&]() {
7680 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7681 L->getHeader())
7682 << "interleaved loop (interleaved count: "
7683 << NV("InterleaveCount", IC) << ")";
7685 } else {
7686 // If we decided that it is *legal* to vectorize the loop, then do it.
7687 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7688 &LVL, &CM);
7689 LVP.executePlan(LB, DT);
7690 ++LoopsVectorized;
7692 // Add metadata to disable runtime unrolling of the scalar loop when there are
7693 // no runtime checks about strides and memory. A scalar loop that is
7694 // rarely used is not worth unrolling.
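// (Illustrative sketch: AddRuntimeUnrollDisableMetaData attaches loop metadata
// of the form !{!"llvm.loop.unroll.runtime.disable"} to the remaining scalar
// loop, which the loop unroller honors by skipping runtime unrolling.)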
7695 if (!LB.areSafetyChecksAdded())
7696 DisableRuntimeUnroll = true;
7698 // Report the vectorization decision.
7699 ORE->emit([&]() {
7700 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7701 L->getHeader())
7702 << "vectorized loop (vectorization width: "
7703 << NV("VectorizationFactor", VF.Width)
7704 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7708 Optional<MDNode *> RemainderLoopID =
7709 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7710 LLVMLoopVectorizeFollowupEpilogue});
7711 if (RemainderLoopID.hasValue()) {
7712 L->setLoopID(RemainderLoopID.getValue());
7713 } else {
7714 if (DisableRuntimeUnroll)
7715 AddRuntimeUnrollDisableMetaData(L);
7717 // Mark the loop as already vectorized to avoid vectorizing again.
7718 Hints.setAlreadyVectorized();
7721 LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7722 return true;
7725 bool LoopVectorizePass::runImpl(
7726 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7727 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7728 DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7729 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7730 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7731 SE = &SE_;
7732 LI = &LI_;
7733 TTI = &TTI_;
7734 DT = &DT_;
7735 BFI = &BFI_;
7736 TLI = TLI_;
7737 AA = &AA_;
7738 AC = &AC_;
7739 GetLAA = &GetLAA_;
7740 DB = &DB_;
7741 ORE = &ORE_;
7742 PSI = PSI_;
7744 // Don't attempt if
7745 // 1. the target claims to have no vector registers, and
7746 // 2. interleaving won't help ILP.
7748 // The second condition is necessary because, even if the target has no
7749 // vector registers, loop vectorization may still enable scalar
7750 // interleaving.
7751 if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7752 return false;
7754 bool Changed = false;
7756 // The vectorizer requires loops to be in simplified form.
7757 // Since simplification may add new inner loops, it has to run before the
7758 // legality and profitability checks. This means running the loop vectorizer
7759 // will simplify all loops, regardless of whether anything ends up being
7760 // vectorized.
7761 for (auto &L : *LI)
7762 Changed |=
7763 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7765 // Build up a worklist of inner-loops to vectorize. This is necessary as
7766 // the act of vectorizing or partially unrolling a loop creates new loops
7767 // and can invalidate iterators across the loops.
7768 SmallVector<Loop *, 8> Worklist;
7770 for (Loop *L : *LI)
7771 collectSupportedLoops(*L, LI, ORE, Worklist);
7773 LoopsAnalyzed += Worklist.size();
7775 // Now walk the identified inner loops.
7776 while (!Worklist.empty()) {
7777 Loop *L = Worklist.pop_back_val();
7779 // For the inner loops we actually process, form LCSSA to simplify the
7780 // transform.
7781 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7783 Changed |= processLoop(L);
7786 // Process each loop nest in the function.
7787 return Changed;
7790 PreservedAnalyses LoopVectorizePass::run(Function &F,
7791 FunctionAnalysisManager &AM) {
7792 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7793 auto &LI = AM.getResult<LoopAnalysis>(F);
7794 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7795 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7796 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7797 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7798 auto &AA = AM.getResult<AAManager>(F);
7799 auto &AC = AM.getResult<AssumptionAnalysis>(F);
7800 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7801 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7802 MemorySSA *MSSA = EnableMSSALoopDependency
7803 ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7804 : nullptr;
7806 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7807 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7808 [&](Loop &L) -> const LoopAccessInfo & {
7809 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7810 return LAM.getResult<LoopAccessAnalysis>(L, AR);
7812 const ModuleAnalysisManager &MAM =
7813 AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7814 ProfileSummaryInfo *PSI =
7815 MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7816 bool Changed =
7817 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7818 if (!Changed)
7819 return PreservedAnalyses::all();
7820 PreservedAnalyses PA;
7822 // We currently do not preserve LoopInfo/DominatorTree analyses with outer loop
7823 // vectorization. Until this is addressed, mark these analyses as preserved
7824 // only for non-VPlan-native path.
7825 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7826 if (!EnableVPlanNativePath) {
7827 PA.preserve<LoopAnalysis>();
7828 PA.preserve<DominatorTreeAnalysis>();
7830 PA.preserve<BasicAA>();
7831 PA.preserve<GlobalsAA>();
7832 return PA;