//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
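//
// For example (illustrative only), with a vectorization factor (VF) of 4 a
// scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each 'wide' iteration processes four
// elements at once and the induction variable advances by four:
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];  // one SIMD add per wide iteration
//
// with the remaining iterations handled by a scalar epilogue or by
// predication.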
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanHCFGTransforms.h"
#include "VPlanPredicator.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <functional>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Indicates that an epilogue is undesired, predication is preferred.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
static cl::opt<bool> PreferPredicateOverEpilog(
    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
    cl::desc("Indicate that an epilogue is undesired, predication should be "
             "used instead."));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
static Type *ToVectorTy(Type *Scalar, unsigned VF) {
  if (Scalar->isVoidTy() || VF == 1)
    return Scalar;
  return VectorType::get(Scalar, VF);
}

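// For example (illustrative): ToVectorTy(i32, 4) yields <4 x i32>, while
// ToVectorTy(i32, 1) and ToVectorTy(void, VF) return the incoming type
// unchanged.
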
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF > 1) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

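// For example (illustrative, assuming a typical x86-64 data layout): x86_fp80
// has a store size of 10 bytes but an alloc size of 16 bytes, so an array of
// VF elements occupies VF * 16 bytes while a <VF x x86_fp80> vector stores
// only VF * 10 bytes; the type is therefore "irregular" and such accesses are
// not widened by simply bitcasting consecutive memory.
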
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

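// For example (illustrative): under the 50% assumption above, an instruction
// in a predicated block whose scalar cost is C contributes roughly
// C / getReciprocalPredBlockProb() == C / 2 to the estimated per-iteration
// cost of the loop.
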
/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
  /// Return the pre-header block of the new loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I);

  /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
  void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
                            bool IfPredicateInstr);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
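  ///
  /// For example (illustrative IR, assuming VF = 4): if %d = add i32 %x, %y is
  /// scalarized, its four scalar copies %d.0 ... %d.3 are recorded in the
  /// scalar map; when a vector use is visited, the vector form is built on
  /// demand roughly as
  ///   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
  ///   ... (lanes 2 and 3 likewise)
  /// and the result is cached in VectorLoopValueMap for subsequent uses.
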
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize the interleaved access group that \p Instr belongs to,
  /// optionally masking the vector operations if \p BlockInMask is non-null.
  void vectorizeInterleaveGroup(Instruction *Instr,
                                VectorParts *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions, optionally masking the vector
  /// operations if \p BlockInMask is non-null.
  void vectorizeMemoryInstruction(Instruction *Instr,
                                  VectorParts *BlockInMask = nullptr);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
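
  // For example (illustrative): with UF = 2 and VF = 4, a vectorized value is
  // held as 2 vector Values (one per unrolled part, see VectorParts), while a
  // scalarized value is held as 2 x 4 scalar Values addressed by (Part, Lane)
  // through ScalarParts.
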
  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Insert the new loop to the loop hierarchy and pass manager
  /// and update the analysis passes.
  void updateAnalysis();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;
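
  // For example (illustrative): for an integer induction that starts at 10
  // with step 3, transforming Index = 5 produces code computing
  // 10 + 5 * 3 == 25; for a pointer induction with start %p and step 4, the
  // same index addresses &%p[5 * 4].
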
  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
static void debugVectorizationFailure(const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {
void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}
} // end namespace llvm

/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
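///
/// For example (illustrative numbers): if the scalar loop body costs 8 units
/// and the VF = 4 body costs 20 units, the per-lane cost 20 / 4 = 5 is cheaper
/// than 8, so a vectorization factor of 4 is considered profitable;
/// selectVectorizationFactor compares candidate VFs using costs normalized by
/// the VF in this way.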
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
    assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
    if (VF == 1)
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }

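  // For example (illustrative): for an interleave group {A, B, C} with factor
  // 3 whose insert position is B, the decision (W, Cost) is recorded for B and
  // (W, 0) for A and C, so the group's cost is counted exactly once when
  // per-instruction costs are summed.
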
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return CM_GatherScatter;

    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    auto Itr = WideningDecisions.find(InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }

  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  unsigned getWideningCost(Instruction *I, unsigned VF) {
    assert(VF >= 2 && "Expected VF >=2");
    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
    assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(Op);
  }

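  // For example (illustrative): given a primary i64 induction %iv, a truncate
  // %t = trunc i64 %iv to i32 inside the loop is "optimizable": rather than
  // widening the trunc, the vectorizer can introduce a new i32 induction
  // variable and use it directly in place of %t.
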
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(unsigned VF);

  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    bool LI = isa<LoadInst>(V);
    bool SI = isa<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ty = getMemInstValueType(V);
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);

  // Returns true if \p I is an instruction that will be predicated either
  // through scalar predication or masked load/store or masked gather/scatter.
  // Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) {
    if (!blockNeedsPredication(I->getParent()))
      return false;
    // Loads and stores that need some form of masked operation are predicated
    // instructions.
    if (isa<LoadInst>(I) || isa<StoreInst>(I))
      return Legal->isMaskRequired(I);
    return isScalarWithPredication(I);
  }

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either the vector version isn't available, or it is too
  /// expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);

private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, unsigned>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              unsigned VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(unsigned VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(unsigned VF);
  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
                                std::pair<InstWidening, unsigned>>;

  DecisionList WideningDecisions;
  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, unsigned VF) const {
    Instruction *I = dyn_cast<Instruction>(V);
    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return Scalars.find(VF) == Scalars.end() ||
           !isScalarAfterVectorization(I, VF);
  }

  /// Returns a range containing only operands needing to be extracted.
  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
                                                   unsigned VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
  }
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};

} // end namespace llvm
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will be
// relaxed in the future.
// Please, note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legal checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(Fn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
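// Illustrative example (not part of the original source): an outer loop
// qualifies for the checks above when it carries an enabling hint together
// with an explicit vector length, e.g.
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];
//
// The same loop annotated with an interleave hint (interleave_count) would be
// rejected by the interleave check above.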
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If we
  // are stress testing the VPlan H-CFG construction, we collect the outermost
  // loop of every loop nest.
  if (L.empty() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
      V.push_back(&L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  for (Loop *InnerL : L)
    collectSupportedLoops(*InnerL, LI, ORE, V);
}
/// The LoopVectorize Pass.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID) {
    Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
    Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI);
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Else, broadcast will be inside
  // vector loop body.
  Instruction *Instr = dyn_cast<Instruction>(V);
  bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
                     (!Instr ||
                      DT->dominates(Instr->getParent(), LoopVectorPreHeader));
  // Place the code for broadcasting invariant variables in the new preheader.
  IRBuilder<>::InsertPointGuard Guard(Builder);
  if (SafeToHoist)
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // Broadcast the scalar into all locations in the vector.
  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

  return Shuf;
}
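// Illustrative sketch (not part of the original source): for VF = 4, the
// CreateVectorSplat call above typically materializes the broadcast as an
// insertelement into lane 0 followed by a zero-mask shufflevector, roughly
//   %t         = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast = shufflevector <4 x i32> %t, <4 x i32> undef,
//                              <4 x i32> zeroinitializer
// The exact IR depends on IRBuilder constant folding; this only shows the
// shape of the emitted splat.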
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
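// Illustrative sketch (assumed example, not from the source): for an i32
// induction with start 0, step 1, VF = 4 and UF = 1, the code above produces
// roughly
//   vector.body:
//     %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
// i.e. SplatVF holds VF * Step and the final step.add feeds the phi back edge.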
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
  return Cost->isScalarAfterVectorization(I, VF) ||
         Cost->isProfitableToScalarize(I, VF);
}
bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
  if (shouldScalarizeInstruction(IV))
    return true;
  auto isScalarInst = [&](User *U) -> bool {
    auto *I = cast<Instruction>(U);
    return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
  };
  return llvm::any_of(IV->users(), isScalarInst);
}
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that casted Phi is equal to the
  // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
  // re-uses the same InductionDescriptor that original IV uses but we don't
  // have to do any recording in this case - that is done when original IV is
  // processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (Casts.empty())
    return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if exist) have no uses outside the
  // induction update chain itself.
  Instruction *CastInst = *Casts.begin();
  if (Lane < UINT_MAX)
    VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
  else
    VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
}
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                          Instruction::BinaryOps BinOp) {
  // Create and check the types.
  assert(Val->getType()->isVectorTy() && "Must be a vector");
  int VLen = Val->getType()->getVectorNumElements();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from zero to VF.
    for (int i = 0; i < VLen; ++i)
      Indices.push_back(ConstantInt::get(STy, StartIdx + i));

    // Add the consecutive indices to the vector value.
    Constant *Cv = ConstantVector::get(Indices);
    assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(Cv, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from zero to VF.
  for (int i = 0; i < VLen; ++i)
    Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));

  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);

  Step = Builder.CreateVectorSplat(VLen, Step);

  // Floating point operations had to be 'fast' to enable the induction.
  FastMathFlags Flags;
  Flags.setFast();

  Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // Have to check, MulOp may be a constant
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);

  Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
  if (isa<Instruction>(BOp))
    cast<Instruction>(BOp)->setFastMathFlags(Flags);
  return BOp;
}
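// Worked example (illustrative only): for an integer Val that is a splat of
// %x, StartIdx = 0, Step = %s and VLen = 4, the integer branch above computes
//   %induction = %Val + <0, 1, 2, 3> * splat(%s)
// so lane i holds %x + i * %s, one value per consecutive scalar iteration;
// the FP branch does the same with fmul/fadd (or fsub) under fast-math flags.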
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF > 1 && "VF should be greater than one");

  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
                                                                         : VF;
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}
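// Worked example (illustrative only): with VF = 4, UF = 2 and a non-uniform
// EntryVal, the nested loop above materializes eight scalar values
//   ScalarIV + (VF * Part + Lane) * Step,   Part in {0,1}, Lane in {0..3}
// i.e. offsets 0..7 times Step, one per lane of the two unrolled parts; a
// uniform EntryVal only gets the Lane-0 value of each part.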
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
Value *
InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
                                            const VPIteration &Instance) {
  // If the value is not an instruction contained in the loop, it should
  // already be scalar.
  if (OrigLoop->isLoopInvariant(V))
    return V;

  assert(Instance.Lane > 0
             ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
             : true && "Uniform values only have lane zero");

  // If the value from the original loop has not been vectorized, it is
  // represented by UF x VF scalar values in the new loop. Return the requested
  // scalar value.
  if (VectorLoopValueMap.hasScalarValue(V, Instance))
    return VectorLoopValueMap.getScalarValue(V, Instance);

  // If the value has not been scalarized, get its entry in VectorLoopValueMap
  // for the given unroll part. If this entry is not a vector type (i.e., the
  // vectorization factor is one), there is no need to generate an
  // extractelement instruction.
  auto *U = getOrCreateVectorValue(V, Instance.Part);
  if (!U->getType()->isVectorTy()) {
    assert(VF == 1 && "Value not scalarized has non-vector type");
    return U;
  }

  // Otherwise, the value from the original loop has been vectorized and is
  // represented by UF vector values. Extract and return the requested scalar
  // value from the appropriate vector lane.
  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
}
void InnerLoopVectorizer::packScalarIntoVectorValue(
    Value *V, const VPIteration &Instance) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't pack a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
  Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
  VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                            Builder.getInt32(Instance.Lane));
  VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  SmallVector<Constant *, 8> ShuffleMask;
  for (unsigned i = 0; i < VF; ++i)
    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));

  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "reverse");
}
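// For example (illustrative): with VF = 4 the mask built above is
// <3, 2, 1, 0>, so the first lane of the result is the last lane of Vec.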
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
    return EnableMaskedInterleavedMemAccesses;

  return TTI.enableMaskedInterleavedAccessVectorization();
}
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec    = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec    = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec    = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (IsMaskForCondRequired || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        Value *GroupMask = MaskForGaps;
        if (IsMaskForCondRequired) {
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              Mask[Part], Undefs, RepMask, "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
                                            Group->getAlignment(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr;
    if (IsMaskForCondRequired) {
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          Mask[Part], Undefs, RepMask, "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    } else
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
                                                 Group->getAlignment());

    Group->addMetadata(NewStoreInstr);
  }
}
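// Note on the masked path above (illustrative sketch): createReplicatedMask
// widens the per-iteration block mask to interleaved-group width, e.g. for
// VF = 4 and InterleaveFactor = 3 a mask <m0, m1, m2, m3> becomes
//   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
// so all members of one tuple are enabled or disabled together, while
// MaskForGaps additionally clears the lanes that correspond to missing
// members of the group.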
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VectorParts *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  const Align Alignment =
      DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});

  VectorParts Mask;
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    Mask = *BlockInMask;

  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
    InBounds = gep->isInBounds();

  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
      PartPtr->setIsInBounds(InBounds);
    }

    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
                                            Alignment.value(), MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, Ptr);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
                                            Alignment.value(), Mask[Part]);
        else
          NewSI =
              Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
                                          "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}
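// Illustrative sketch (assumed example): for a consecutive, non-reversed
// access with VF = 4 and UF = 2, CreateVecPtr above produces the part
// pointers
//   Part 0: gep Ptr, 0   -> bitcast to <4 x Ty>*
//   Part 1: gep Ptr, 4   -> bitcast to <4 x Ty>*
// while the reversed case first steps by -Part * VF and then by 1 - VF, so
// each wide access starts at the last element of its group of VF lanes.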
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  setDebugLocFromInst(Builder, Instr);

  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  // Add the cloned scalar to the scalar map entry.
  VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
    if (II->getIntrinsicID() == Intrinsic::assume)
      AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
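// Illustrative sketch (not part of the original source): the skeleton built
// above looks roughly like
//   header:
//     %index = phi i64 [ %start, %preheader ], [ %index.next, %latch ]
//     ...
//   latch:
//     %index.next = add i64 %index, %step
//     %cmp        = icmp eq i64 %index.next, %end
//     br i1 %cmp, label %exit, label %header
// with the previous latch terminator erased as done just above.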
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  Constant *Step = ConstantInt::get(Ty, VF * UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // If there is a non-reversed interleaved group that may speculatively access
  // memory out-of-bounds, we need to ensure that there will be at least one
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step.
  if (VF > 1 && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
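// Worked example (illustrative only): with a trip count N = 100 and
// VF * UF = 8, n.mod.vf = 100 % 8 = 4 and n.vec = 96, leaving 4 scalar
// iterations. If a scalar epilogue is required and N = 96, the remainder of 0
// is bumped up to 8 above, giving n.vec = 88 so the epilogue runs at least
// once. When folding the tail by masking, N is first rounded up to a multiple
// of 8 instead.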
Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                                   const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  unsigned VF = DstVTy->getNumElements();
  VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
  VectorType *VecIntTy = VectorType::get(IntTy, VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
  return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
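// Illustrative example (assumed types): casting <2 x double> to <2 x i8*> is
// not a single bitcast, so the two-step path above goes through an integer
// vector whose element size matches (64 bits in a typical data layout):
//   <2 x double> --bitcast--> <2 x i64> --inttoptr--> <2 x i8*>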
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  BasicBlock *BB = L->getLoopPreheader();
  IRBuilder<> Builder(BB->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking())
    CheckMinIters = Builder.CreateICmp(
        P, Count, ConstantInt::get(Count->getType(), VF * UF),
        "min.iters.check");

  BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, CheckMinIters));
  LoopBypassBlocks.push_back(BB);
}
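// Illustrative sketch (not part of the original source): for VF = 4, UF = 2
// and no required scalar epilogue, the check inserted above is roughly
//   %min.iters.check = icmp ult i64 %trip.count, 8
//   br i1 %min.iters.check, label %bypass, label %vector.ph
// i.e. the bypass block is taken whenever fewer than VF * UF iterations are
// available for the vector loop.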
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code to check the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck =
      Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());

  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!BB->getParent()->hasOptSize() &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  // Create a new block containing the stride check.
  BB->setName("vector.scevcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, SCEVCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;
}
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  BasicBlock *BB = L->getLoopPreheader();

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
  if (!MemRuntimeCheck)
    return;

  if (BB->getParent()->hasOptSize()) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  // Create a new block containing the memory check.
  BB->setName("vector.memcheck");
  auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
  // Update dominator tree immediately if the generated block is a
  // LoopBypassBlock because SCEV expansions to generate loop bypass
  // checks may query it before the current function is finished.
  DT->addNewBlock(NewBB, BB);
  if (L->getParentLoop())
    L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
  ReplaceInstWithInst(BB->getTerminator(),
                      BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
  LoopBypassBlocks.push_back(BB);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                          PSE.getSE());
  LVer->prepareNoAliasMetadata();
}
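// Sketch of the effect of the memory checks emitted above: for two pointers
// A and B accessed in the loop, the "vector.memcheck" block roughly tests
// whether the accessed ranges [A, A+N) and [B, B+N) overlap and, if so,
// branches to the bypass (scalar) block; the noalias metadata prepared via
// LoopVersioning lets later passes treat the vector loop's accesses as
// independent when the check passes.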
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
                                           &*B.GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check, the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
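// In effect, for an integer induction the transformed index above is
//   StartValue + Index * Step
// (with the trivial constant-step cases folded), for a pointer induction it
// is a GEP of StartValue by Index * Step elements, and for an FP induction
// it is the original FAdd/FSub applied with fast-math flags.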
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal = PHINode::Create(
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
  }

  // We need the OrigLoop (scalar loop part) latch terminator to help
  // produce correct debug info for the middle block BB instructions.
  // The legality check stage guarantees that the loop will have a single
  // latch.
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
         "Scalar loop latch terminator isn't a branch");
  BranchInst *ScalarLatchBr =
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  Value *CmpN = Builder.getTrue();
  if (!Cost->foldTailByMasking()) {
    CmpN =
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
  }

  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    Lp->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

  return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input
// value, coming from the remainder loop. We need those PHIs to also have a
// correct value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
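// For instance, given
//
//   for (i = 0; i < n; ++i) { ... }
//   use(i);
//
// the LCSSA phi for 'i' in the exit block gets the vector trip count as its
// incoming value from the middle block, while an external user of the
// pre-increment value gets Start + Step * (CountRoundDown - 1), recomputed
// above via emitTransformedIndex.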
namespace {

struct CSEDenseMapInfo {
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple cse.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *In = &*I++;

    if (!CSEDenseMapInfo::canHandle(In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
    if (Instruction *V = CSEMap.lookup(In)) {
      In->replaceAllUsesWith(V);
      In->eraseFromParent();
      continue;
    }

    CSEMap[In] = In;
  }
}
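// Example of the effect of cse() above: two identical
// 'extractelement <4 x i64> %v, i32 0' instructions in the vector body
// collapse into one, with the second's uses rewritten to the first. Only the
// side-effect-free kinds accepted by CSEDenseMapInfo::canHandle participate.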
unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                                       unsigned VF,
                                                       bool &NeedToScalarize) {
  Function *F = CI->getCalledFunction();
  StringRef FnName = CI->getCalledFunction()->getName();
  Type *ScalarRetTy = CI->getType();
  SmallVector<Type *, 4> Tys, ScalarTys;
  for (auto &ArgOp : CI->arg_operands())
    ScalarTys.push_back(ArgOp->getType());

  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from there,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
  unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
  if (VF == 1)
    return ScalarCallCost;

  // Compute corresponding vector type for return value and arguments.
  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
  for (Type *ScalarTy : ScalarTys)
    Tys.push_back(ToVectorTy(ScalarTy, VF));

  // Compute costs of unpacking argument values for the scalar calls and
  // packing the return values to a vector.
  unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);

  unsigned Cost = ScalarCallCost * VF + ScalarizationCost;

  // If we can't emit a vector call for this function, then the currently found
  // cost is the cost we need to return.
  NeedToScalarize = true;
  if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
    return Cost;

  // If the corresponding vector cost is cheaper, return its cost.
  unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
  if (VectorCallCost < Cost) {
    NeedToScalarize = false;
    return VectorCallCost;
  }
  return Cost;
}
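// Cost comparison performed above, in rough terms:
//
//   ScalarizedCost = ScalarCallCost * VF + ScalarizationCost
//   VectorCost     = cost of calling the target library's vector variant
//
// The scalarized cost is returned (with NeedToScalarize = true) unless a
// vector variant exists and is cheaper, in which case NeedToScalarize is
// cleared and the vector call cost is returned instead.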
unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                            unsigned VF) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");

  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<Value *, 4> Operands(CI->arg_operands());
  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

static Type *largestIntegerVectorType(Type *T1, Type *T2) {
  auto *I1 = cast<IntegerType>(T1->getVectorElementType());
  auto *I2 = cast<IntegerType>(T2->getVectorElementType());
  return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
          !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}
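// For example, if the cost model determined that an i32 addition only needs
// 8 bits, each <4 x i32> add above is rewritten as
//   trunc to <4 x i8>  ->  add <4 x i8>  ->  zext back to <4 x i32>
// and InstCombine later removes the redundant trunc/zext pairs.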
void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF > 1)
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Update the dominator tree.
  //
  // FIXME: After creating the structure of the new loop, the dominator tree is
  //        no longer up-to-date, and it remains that way until we update it
  //        here. An out-of-date dominator tree is problematic for SCEV,
  //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
  //        keep the dominator tree up-to-date as we go.
  updateAnalysis();

  // Fix-up external users of the induction variables.
  for (auto &Entry : *Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);
}
void InnerLoopVectorizer::fixCrossIterationPHIs() {
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #2: We now need to fix the recurrences by adding incoming edges to
  // the currently empty PHI nodes. At this point every instruction in the
  // original loop is widened to a vector form so we can use them to construct
  // the incoming edges.
  for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
    // Handle first-order recurrences and reductions that need to be fixed.
    if (Legal->isFirstOrderRecurrence(&Phi))
      fixFirstOrderRecurrence(&Phi);
    else if (Legal->isReductionVariable(&Phi))
      fixReduction(&Phi);
  }
}
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
      isa<PHINode>(PreviousLastPart))
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
  else
    Builder.SetInsertPoint(
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask.
  SmallVector<Constant *, 8> ShuffleMask(VF);
  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                             ConstantVector::get(ShuffleMask))
               : Incoming;
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF > 1)
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
  // of `Incoming`. This is analogous to the vectorized case above: extracting
  // the second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
      RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
          Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
          Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
        ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former.
  if (Cost->foldTailByMasking()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst =
          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"),
          RdxDesc.getFastMathFlags());
    else
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      RdxPart);
  }

  if (VF > 1) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
          RdxDesc.isSigned()
              ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
              : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
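// Concrete shape of the fix-up above for an integer add reduction with
// VF = 4, UF = 2: the two unrolled partial sums are combined with a
// "bin.rdx" add in the middle block, the resulting <4 x i32> is reduced
// horizontally by createTargetReduction, and the scalar result feeds both
// the LCSSA exit phi and the "bc.merge.rdx" phi that seeds the scalar
// remainder loop.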
void InnerLoopVectorizer::fixLCSSAPHIs() {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getNumIncomingValues() == 1) {
      auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values will have only one value.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
      // Can be a loop invariant incoming value or the last scalar value to be
      // extracted from the vectorized loop.
      Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *lastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
    }
  }
}
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
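// For example, an address computation
//   %gep = getelementptr i32, i32* %base, i64 %idx
// whose only use is a store inside the predicated block gets moved into that
// block by the loop above, so it no longer executes when the predicate is
// false; its own operands are then reconsidered for sinking on the next pass.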
void InnerLoopVectorizer::fixNonInductionPHIs() {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    PHINode *NewPhi =
        cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
    unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();

    SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
        predecessors(OrigPhi->getParent()));
    SmallVector<BasicBlock *, 2> VectorBBPredecessors(
        predecessors(NewPhi->getParent()));
    assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
           "Scalar and Vector BB should have the same number of predecessors");

    // The insertion point in Builder may be invalidated by the time we get
    // here. Force the Builder insertion point to something valid so that we do
    // not run into issues during insertion point restore in
    // getOrCreateVectorValue calls below.
    Builder.SetInsertPoint(NewPhi);

    // The predecessor order is preserved and we can rely on mapping between
    // scalar and vector block predecessors.
    for (unsigned i = 0; i < NumIncomingValues; ++i) {
      BasicBlock *NewPredBB = VectorBBPredecessors[i];

      // When looking up the new scalar/vector values to fix up, use incoming
      // values from the original phi.
      Value *ScIncV =
          OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);

      // Scalar incoming value may need a broadcast.
      Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
      NewPhi->addIncoming(NewIncV, NewPredBB);
    }
  }
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
    // These are the scalar results. Notice that we don't generate vector GEPs
    // because scalar GEPs result in better code.
    for (unsigned Part = 0; Part < UF; ++Part) {
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep =
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
        SclrGep->setName("next.gep");
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
      }
    }
    return;
  }
  }
}
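// For the pointer induction case above with VF = 4, UF = 1 and a unit step,
// each lane roughly produces
//   %next.gep = getelementptr T, T* %start, i64 (%index + lane)
// via emitTransformedIndex, rather than one vector GEP, matching the note
// that scalar GEPs result in better code here.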
/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
static bool mayDivideByZero(Instruction &I) {
  assert((I.getOpcode() == Instruction::UDiv ||
          I.getOpcode() == Instruction::SDiv ||
          I.getOpcode() == Instruction::URem ||
          I.getOpcode() == Instruction::SRem) &&
         "Unexpected instruction");
  Value *Divisor = I.getOperand(1);
  auto *CInt = dyn_cast<ConstantInt>(Divisor);
  return !CInt || CInt->isZero();
}
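// Usage sketch: for a conditional 'x / y' in the scalar loop, a provably
// non-zero constant divisor lets the widened division execute
// unconditionally, whereas a variable (or zero) divisor keeps the division
// predicated so vectorization cannot introduce a trap the scalar code would
// not have executed.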
4061 void InnerLoopVectorizer::widenInstruction(Instruction
&I
) {
4062 switch (I
.getOpcode()) {
4063 case Instruction::Br
:
4064 case Instruction::PHI
:
4065 llvm_unreachable("This instruction is handled by a different recipe.");
4066 case Instruction::GetElementPtr
: {
4067 // Construct a vector GEP by widening the operands of the scalar GEP as
4068 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4069 // results in a vector of pointers when at least one operand of the GEP
4070 // is vector-typed. Thus, to keep the representation compact, we only use
4071 // vector-typed operands for loop-varying values.
4072 auto *GEP
= cast
<GetElementPtrInst
>(&I
);
4074 if (VF
> 1 && OrigLoop
->hasLoopInvariantOperands(GEP
)) {
4075 // If we are vectorizing, but the GEP has only loop-invariant operands,
4076 // the GEP we build (by only using vector-typed operands for
4077 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4078 // produce a vector of pointers, we need to either arbitrarily pick an
4079 // operand to broadcast, or broadcast a clone of the original GEP.
4080 // Here, we broadcast a clone of the original.
4082 // TODO: If at some point we decide to scalarize instructions having
4083 // loop-invariant operands, this special case will no longer be
4084 // required. We would add the scalarization decision to
4085 // collectLoopScalars() and teach getVectorValue() to broadcast
4086 // the lane-zero scalar value.
4087 auto *Clone
= Builder
.Insert(GEP
->clone());
4088 for (unsigned Part
= 0; Part
< UF
; ++Part
) {
4089 Value
*EntryPart
= Builder
.CreateVectorSplat(VF
, Clone
);
4090 VectorLoopValueMap
.setVectorValue(&I
, Part
, EntryPart
);
4091 addMetadata(EntryPart
, GEP
);
4094 // If the GEP has at least one loop-varying operand, we are sure to
4095 // produce a vector of pointers. But if we are only unrolling, we want
4096 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4097 // produce with the code below will be scalar (if VF == 1) or vector
4098 // (otherwise). Note that for the unroll-only case, we still maintain
4099 // values in the vector mapping with initVector, as we do for other
4101 for (unsigned Part
= 0; Part
< UF
; ++Part
) {
4102 // The pointer operand of the new GEP. If it's loop-invariant, we
4103 // won't broadcast it.
4105 OrigLoop
->isLoopInvariant(GEP
->getPointerOperand())
4106 ? GEP
->getPointerOperand()
4107 : getOrCreateVectorValue(GEP
->getPointerOperand(), Part
);
4109 // Collect all the indices for the new GEP. If any index is
4110 // loop-invariant, we won't broadcast it.
4111 SmallVector
<Value
*, 4> Indices
;
4112 for (auto &U
: make_range(GEP
->idx_begin(), GEP
->idx_end())) {
4113 if (OrigLoop
->isLoopInvariant(U
.get()))
4114 Indices
.push_back(U
.get());
4116 Indices
.push_back(getOrCreateVectorValue(U
.get(), Part
));
4119 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4120 // but it should be a vector, otherwise.
4123 ? Builder
.CreateInBoundsGEP(GEP
->getSourceElementType(), Ptr
,
4125 : Builder
.CreateGEP(GEP
->getSourceElementType(), Ptr
, Indices
);
4126 assert((VF
== 1 || NewGEP
->getType()->isVectorTy()) &&
4127 "NewGEP is not a pointer vector");
4128 VectorLoopValueMap
.setVectorValue(&I
, Part
, NewGEP
);
4129 addMetadata(NewGEP
, GEP
);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(Builder, &I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (Value *Op : I.operands())
        Ops.push_back(getOrCreateVectorValue(Op, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::Select: {
    // If the selector is loop invariant we can create a select
    // instruction with a scalar condition. Otherwise, use vector-select.
    auto *SE = PSE.getSE();
    bool InvariantCond =
        SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
    setDebugLocFromInst(Builder, &I);

    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
    // We have to take the 'vectorized' value and pick the first lane.
    // Instcombine will make this a no-op.

    auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
      Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
      Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
      Value *Sel =
          Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
      VectorLoopValueMap.setVectorValue(&I, Part, Sel);
      addMetadata(Sel, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
      Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      VectorLoopValueMap.setVectorValue(&I, Part, C);
      addMetadata(C, &I);
    }

    break;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    /// Vectorize casts.
    Type *DestTy =
        (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
      addMetadata(Cast, &I);
    }
    break;
  }
  case Instruction::Call: {
    // Ignore dbg intrinsics.
    if (isa<DbgInfoIntrinsic>(I))
      break;
    setDebugLocFromInst(Builder, &I);

    Module *M = I.getParent()->getParent()->getParent();
    auto *CI = cast<CallInst>(&I);

    StringRef FnName = CI->getCalledFunction()->getName();
    Function *F = CI->getCalledFunction();
    Type *RetTy = ToVectorTy(CI->getType(), VF);
    SmallVector<Type *, 4> Tys;
    for (Value *ArgOperand : CI->arg_operands())
      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    // The flag shows whether we use Intrinsic or a usual Call for vectorized
    // version of the instruction.
    // Is it beneficial to perform intrinsic call compared to lib call?
    bool NeedToScalarize;
    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
    bool UseVectorIntrinsic =
        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
    assert((UseVectorIntrinsic || !NeedToScalarize) &&
           "Instruction should be scalarized elsewhere.");
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 4> Args;
      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
        Value *Arg = CI->getArgOperand(i);
        // Some intrinsics have a scalar argument - don't replace it with a
        // vector.
        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
        Args.push_back(Arg);
      }

      Function *VectorF;
      if (UseVectorIntrinsic) {
        // Use vector version of the intrinsic.
        Type *TysForDecl[] = {CI->getType()};
        if (VF > 1)
          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
      } else {
        // Use vector version of the library call.
        StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
        assert(!VFnName.empty() && "Vector function name is empty.");
        VectorF = M->getFunction(VFnName);
        if (!VectorF) {
          // Generate a declaration
          FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
          VectorF =
              Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
          VectorF->copyAttributesFrom(F);
        }
      }
      assert(VectorF && "Can't create vector function.");

      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

      if (isa<FPMathOperator>(V))
        V->copyFastMathFlags(CI);

      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // DT is not kept up-to-date for outer loop vectorization
  if (EnableVPlanNativePath)
    return;

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  DT->addNewBlock(LoopMiddleBlock,
                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };
  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).
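  //
  // For example (illustrative), a getelementptr that is used only as the
  // address of a consecutive (non-gather) load keeps a scalar address
  // computation, so it is seeded into the scalar worklist here.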
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Add to the worklist all pointer induction variables and their update
  // instructions.
  //
  // TODO: Once we are able to vectorize pointer induction variables we should
  // no longer insert them into the worklist here.
  auto *Latch = TheLoop->getLoopLatch();
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
      continue;
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }
  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }
  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // We already considered pointer induction variables, so there's no reason
    // to look at their users again.
    //
    // TODO: Once we are able to vectorize pointer induction variables we
    // should no longer skip over them here.
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch(I->getOpcode()) {
  default:
    break;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    auto *Ptr = getLoadStorePointerOperand(I);
    auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction, get that
    // result.
    if (VF > 1) {
      InstWidening WideningDecision = getWideningDecision(I, VF);
      assert(WideningDecision != CM_Unknown &&
             "Widening decision should be ready at this moment");
      return WideningDecision == CM_Scalarize;
    }
    const MaybeAlign Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(I) ?
        !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
      : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
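    // Divides in a predicated block are scalarized and predicated so that a
    // lane which would not execute in the original scalar loop cannot trap,
    // e.g. by dividing by zero.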
    return mayDivideByZero(*I);
  }
  return false;
}
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getMemInstValueType(I);
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps.
  bool PredicatedAccessRequiresMasking =
      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
  bool AccessWithGapsRequiresMasking =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  auto *Ty = getMemInstValueType(I);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
  // Get and ensure we have a valid memory instruction.
  LoadInst *LI = dyn_cast<LoadInst>(I);
  StoreInst *SI = dyn_cast<StoreInst>(I);
  assert((LI || SI) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I))
    return false;

  // If the instruction's allocated size doesn't equal it's type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
  if (hasIrregularType(ScalarTy, DL, VF))
    return false;

  return true;
}
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
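
  // For example (illustrative), the address of a consecutive load (CM_Widen)
  // stays a single scalar pointer in the vector loop, whereas a gather
  // (CM_GatherScatter) needs a vector of pointers, so only the former counts
  // as a uniform-preserving decision here.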
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          })) {
        Worklist.insert(OI);
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
                      << "\n");
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationCostModel::runtimeChecksRequired() {
  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");

  if (Legal->getRuntimePointerChecking()->Need) {
    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
        "runtime pointer checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  if (!PSE.getUnionPredicate().getPredicates().empty()) {
    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
        "runtime SCEV checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  // FIXME: Avoid specializing for stride==1 instead of bailing out.
  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of "
        "this loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
    return true;
  }

  return false;
}
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may by useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                           "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI))
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

  unsigned MaxVF = computeFeasibleMaxVF(TC);
  if (TC > 0 && TC % MaxVF == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
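  // For example (illustrative), a trip count of 100 with MaxVF = 8 leaves a
  // tail of 100 % 8 = 4 iterations; folding the tail masks off those lanes
  // instead of emitting a scalar epilogue.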
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}
unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  unsigned MaxVectorSize = WidestRegister / WidestType;
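
  // For example (illustrative), with 256-bit wide registers and a widest loop
  // type of i32, MaxVectorSize = 256 / 32 = 8 lanes.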
  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<unsigned, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
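    // For example (illustrative), if the scalar loop costs 8 and the VF = 4
    // loop costs 20, the per-lane cost is 20 / 4 = 5, which beats the scalar
    // cost and makes VF = 4 the current best width.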
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
  return Factor;
}
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

      // Ignore loaded pointer types and stored pointer types that are not
      // vectorizable.
      //
      // FIXME: The check here attempts to predict whether a load or store will
      //        be vectorized. We only know this for certain after a VF has
      //        been selected. Here, we assume that if an access can be
      //        vectorized, it will be. We should also look at extending this
      //        optimization to non-pointer types.
      //
      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
          !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
        continue;

      MinWidth = std::min(MinWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
      MaxWidth = std::max(MaxWidth,
                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
    }
  }
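
  // For example (illustrative), a loop that loads i8 values and accumulates
  // them into an i64 reduction reports {8, 64}.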
  return {MinWidth, MaxWidth};
}
unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
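  //
  // For example (illustrative), with 16 registers in a class, 2 of them tied
  // up by loop-invariant values and a peak in-loop usage of 4, the estimate is
  // PowerOf2Floor((16 - 2) / 4) = 2 interleaved copies.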
  unsigned IC = UINT_MAX;

  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
    if (VF == 1) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If the trip count is constant, limit the interleave count to be less than
  // the trip count divided by VF.
  if (TC > 0) {
    assert(TC >= VF && "VF exceeds trip count?");
    if ((TC / VF) < MaxInterleaveCount)
      MaxInterleaveCount = (TC / VF);
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && !Legal->getReductionVars()->empty()) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };
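
  // For example (illustrative), a <4 x i64> value on a target with 128-bit
  // vector registers counts as max(1, 4 * 64 / 128) = 2 registers.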
  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (Ends.find(I) == Ends.end())
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of live intervals.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j] == 1) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
          if (RegUsage.find(ClassID) == RegUsage.end())
            RegUsage[ClassID] = 1;
          else
            RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = 1;
            else
              RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
            if (RegUsage.find(ClassID) == RegUsage.end())
              RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
            else
              RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto &pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
        else
          MaxUsages[j][pair.first] = pair.second;
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto Inst : LoopInvariants) {
      unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
      if (Invariant.find(ClassID) == Invariant.end())
        Invariant[ClassID] = Usage;
      else
        Invariant[ClassID] += Usage;
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n";
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n";
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n";
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          NumPredStores > NumberOfStoresToPredicate);
}
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.
  if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic if hacked cost is needed
        // for emulated masked memrefs.
        if (!useEmulatedMaskMemRefHack(&I) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }
  }
}
5543 int LoopVectorizationCostModel::computePredInstDiscount(
5544 Instruction
*PredInst
, DenseMap
<Instruction
*, unsigned> &ScalarCosts
,
5546 assert(!isUniformAfterVectorization(PredInst
, VF
) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF))
          ScalarCost += TTI.getScalarizationOverhead(
              ToVectorTy(J->getType(), VF), false, true);
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
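    // Illustrative sketch (numbers are hypothetical, not from any target):
    // with VF = 4, a predicated instruction whose vector cost is 28 and whose
    // per-lane scalar cost is 5 gives ScalarCost = 4 * 5 = 20, scaled down to
    // 10 by getReciprocalPredBlockProb() (2 by default), so this instruction
    // contributes a discount of 28 - 10 = 18 and scalarization looks
    // profitable.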
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
  VectorizationCostTy Cost;

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    VectorizationCostTy BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
          (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
        continue;

      VectorizationCostTy C = getInstructionCost(&I, VF);

      // Check if we should override the cost.
      if (ForceTargetInstructionCost.getNumOccurrences() > 0)
        C.first = ForceTargetInstructionCost;

      BlockCost.first += C.first;
      BlockCost.second |= C.second;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
                        << " for VF " << VF << " For instruction: " << I
                        << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block. Thus, scale the block's cost by the probability of
    // executing it.
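    // Illustrative sketch: getReciprocalPredBlockProb() currently returns 2,
    // i.e. the predicated block is assumed to execute on half the iterations,
    // so a predicated block whose instructions sum to a cost of 12 only
    // contributes 6 to the scalar (VF == 1) loop cost.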
    if (VF == 1 && blockNeedsPredication(BB))
      BlockCost.first /= getReciprocalPredBlockProb();

    Cost.first += BlockCost.first;
    Cost.second |= BlockCost.second;
  }

  return Cost;
}
/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(
              Value *Ptr,
              LoopVectorizationLegality *Legal,
              PredicatedScalarEvolution &PSE,
              const Loop *TheLoop) {
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
  if (!Gep)
    return nullptr;

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
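  // For example (illustrative IR), the access A[j][i] with loop-invariant %j
  // inside a loop over %i:
  //   %gep = getelementptr [100 x i32], [100 x i32]* %A, i64 %j, i64 %i
  // qualifies, since every index is either loop-invariant or an induction
  // variable.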
  auto SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned i = 1; i < NumOperands; ++i) {
    Value *Opd = Gep->getOperand(i);
    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
        !Legal->isInductionVariable(Opd))
      return nullptr;
  }

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
  return PSE.getSCEV(Ptr);
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
  return Legal->hasStride(I->getOperand(0)) ||
         Legal->hasStride(I->getOperand(1));
}
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                   Alignment ? Alignment->value() : 0, AS);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                             unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  Value *Ptr = getLoadStorePointerOperand(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);

  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  unsigned Cost = 0;
  if (Legal->isMaskRequired(I))
    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
                                      Alignment ? Alignment->value() : 0, AS);
  else
    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
                                Alignment ? Alignment->value() : 0, AS, I);

  bool Reverse = ConsecutiveStride < 0;
  if (Reverse)
    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  return Cost;
}
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                         unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  if (isa<LoadInst>(I)) {
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy,
                               Alignment ? Alignment->value() : 0, AS) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy,
                             Alignment ? Alignment->value() : 0, AS) +
         (isLoopInvariantStoreValue
              ? 0
              : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
                                       VF - 1));
}
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                          unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  const MaybeAlign Alignment = getLoadStoreAlignment(I);
  Value *Ptr = getLoadStorePointerOperand(I);

  return TTI.getAddressComputationCost(VectorTy) +
         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
                                    Legal->isMaskRequired(I),
                                    Alignment ? Alignment->value() : 0);
}
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  unsigned InterleaveFactor = Group->getFactor();
  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
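  // Illustrative sketch (assuming VF = 4 and a factor-2 i32 load group with
  // both members present): conceptually the group is costed as one wide load
  // plus the shuffles that de-interleave even and odd elements, e.g.
  //   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  //   %strided.vec0 = shufflevector %wide.vec, undef, <0, 2, 4, 6>
  //   %strided.vec1 = shufflevector %wide.vec, undef, <1, 3, 5, 7>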
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}
unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                              unsigned VF) {
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // moment.
  if (VF == 1) {
    Type *ValTy = getMemInstValueType(I);
    const MaybeAlign Alignment = getLoadStoreAlignment(I);
    unsigned AS = getLoadStoreAddressSpace(I);

    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(I->getOpcode(), ValTy,
                               Alignment ? Alignment->value() : 0, AS, I);
  }
  return getWideningCost(I, VF);
}
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = 1;

  if (VF > 1 && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.find(I) != InstSet.end())
      return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
  }

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized =
      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
  return VectorizationCostTy(C, TypeNotScalarized);
}
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                              unsigned VF) {
  if (VF == 1)
    return 0;

  unsigned Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(RetTy, true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), VF);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
  if (VF == 1)
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
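        // Illustrative sketch of the expected code for a uniform load,
        // assuming VF = 4:
        //   %s      = load i32, i32* %invariant.ptr
        //   %insert = insertelement <4 x i32> undef, i32 %s, i32 0
        //   %splat  = shufflevector <4 x i32> %insert, undef, zeroinitializer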
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
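      // For example (illustrative numbers only): InterleaveCost = 12,
      // GatherScatterCost = 20 and ScalarizationCost = 16 picks CM_Interleave;
      // if interleaving is not possible its cost stays at UINT_MAX and the
      // choice falls back to gather/scatter vs. scalarization.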
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
             PredicatedBBsAfterVectorization.end() ||
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
             PredicatedBBsAfterVectorization.end()))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
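    // For example, a phi merging three if-converted paths,
    //   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ], [ %c, %bb3 ]
    // becomes two vector selects, so it is charged (3 - 1) select costs.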
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF > 1 && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

      // The cost of the non-predicated instruction.
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
  }
  case Instruction::FNeg: {
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0));
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    unsigned Width = VF;
    if (Width > 1) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = 1;
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {

Pass *createLoopVectorizePass() { return new LoopVectorize(); }

Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
  // Check if the pointer operand of a load or store instruction is
  // consecutive.
  if (auto *Ptr = getLoadStorePointerOperand(Inst))
    return Legal->isConsecutivePtr(Ptr);
  return false;
}
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (auto &Reduction : *Legal->getReductionVars()) {
    RecurrenceDescriptor &RedDes = Reduction.second;
    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (auto &Induction : *Legal->getInductionVars()) {
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
  }
}
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
                                 LoopVectorizationCostModel &CM) {
  unsigned WidestType;
  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
  return WidestVectorRegBits / WidestType;
}
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}
void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
  LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                    << '\n');
  BestVF = VF;
  BestUF = UF;

  erase_if(VPlans, [VF](const VPlanPtr &Plan) {
    return !Plan->hasVF(VF);
  });
  assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
}
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  VPTransformState State{BestVF, BestUF,      LI,
                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,   CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  BasicBlock *Latch = OrigLoop->getLoopLatch();

  // We create new control-flow for the vectorized loop, so the original
  // condition will be dead after vectorization if it's only used by the
  // branch.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && Cmp->hasOneUse())
    DeadInstructions.insert(Cmp);

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  for (auto &Induction : *Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
                                 DeadInstructions.end();
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }

Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }

Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
                                        Instruction::BinaryOps BinOp) {
  // When unrolling and the VF is 1, we only need to add a simple scalar.
  Type *Ty = Val->getType();
  assert(!Ty->isVectorTy() && "Val must be a scalar");

  if (Ty->isFloatingPointTy()) {
    Constant *C = ConstantFP::get(Ty, (double)StartIdx);

    // Floating point operations had to be 'fast' to enable the unrolling.
    Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
    return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
  }
  Constant *C = ConstantInt::get(Ty, StartIdx);
  return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  SmallVector<Metadata *, 4> MDs;
  // Reserve first location for self reference to the LoopID metadata node.
  MDs.push_back(nullptr);
  bool IsUnrollMetadata = false;
  MDNode *LoopID = L->getLoopID();
  if (LoopID) {
    // First find existing loop unrolling disable metadata.
    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
      auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
      if (MD) {
        const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata =
            S && S->getString().startswith("llvm.loop.unroll.disable");
      }
      MDs.push_back(LoopID->getOperand(i));
    }
  }

  if (!IsUnrollMetadata) {
    // Add runtime unroll disable metadata.
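    // The added metadata takes the usual self-referential loop-ID form, e.g.
    //   br ..., !llvm.loop !0
    //   !0 = distinct !{!0, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}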
    LLVMContext &Context = L->getHeader()->getContext();
    SmallVector<Metadata *, 1> DisableOperands;
    DisableOperands.push_back(
        MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
    MDs.push_back(DisableNode);
    MDNode *NewLoopID = MDNode::get(Context, MDs);
    // Set operand 0 to refer to the loop id itself.
    NewLoopID->replaceOperandWith(0, NewLoopID);
    L->setLoopID(NewLoopID);
  }
}
bool LoopVectorizationPlanner::getDecisionAndClampRange(
    const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
  assert(Range.End > Range.Start && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);
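  // For example (illustrative): over the range [2, 16), a predicate that
  // holds for VF = 2 and VF = 4 but flips at VF = 8 clamps Range.End to 8,
  // so the decision taken at VF = 2 is reused for the remaining {2, 4}.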
  for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}
/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VF's starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(buildVPlan(SubRange));
    VF = SubRange.End;
  }
}
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  if (!BI->isConditional())
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

  return EdgeMaskCache[Edge] = EdgeMask;
}
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
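    // Illustrative sketch: with a trip count of 7 and VF = 4, BTC = 6 and the
    // second vector iteration compares <4,5,6,7> ule <6,6,6,6>, yielding the
    // mask <1,1,1,0> that disables the single out-of-range lane.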
    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}
VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
                                                           VFRange &Range,
                                                           VPlanPtr &Plan) {
  const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
  if (!IG)
    return nullptr;

  // Now check if IG is relevant for VF's in the given range.
  auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
    return [=](unsigned VF) -> bool {
      return (VF >= 2 && // Query is illegal for VF == 1
              CM.getWideningDecision(I, VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
  };
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
    return nullptr;

  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
  // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
  assert(I == IG->getInsertPos() &&
         "Generating a recipe for an adjunct member of an interleave group");

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPInterleaveRecipe(IG, Mask);
}
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                  VPlanPtr &Plan) {
  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
    return nullptr;

  auto willWiden = [&](unsigned VF) -> bool {
    if (VF == 1)
      return false;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
           "Interleave memory opportunity should be caught earlier.");
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPWidenMemoryInstructionRecipe(*I, Mask);
}
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
    // Check if this is an integer or fp induction. If so, build the recipe that
    // produces its scalar and vector values.
    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
        II.getKind() == InductionDescriptor::IK_FpInduction)
      return new VPWidenIntOrFpInductionRecipe(Phi);

    return nullptr;
  }

  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(unsigned)> {
    return
        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
  };

  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                               isOptimizableIVTruncate(I), Range))
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             cast<TruncInst>(I));
  return nullptr;
}
VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
  PHINode *Phi = dyn_cast<PHINode>(I);
  if (!Phi || Phi->getParent() == OrigLoop->getHeader())
    return nullptr;

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  SmallVector<VPValue *, 2> Masks;
  unsigned NumIncoming = Phi->getNumIncomingValues();
  for (unsigned In = 0; In < NumIncoming; In++) {
    VPValue *EdgeMask =
        createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
    assert((EdgeMask || NumIncoming == 1) &&
           "Multiple predecessors with one having a full mask");
    if (EdgeMask)
      Masks.push_back(EdgeMask);
  }
  return new VPBlendRecipe(Phi, Masks);
}
bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                 VFRange &Range) {

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  if (IsPredicated)
    return false;

  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::Br:
    case Instruction::Call:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::GetElementPtr:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::Load:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PHI:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Store:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return false;

  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
      return false;
  }

  auto willWiden = [&](unsigned VF) -> bool {
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                             CM.isProfitableToScalarize(I, VF)))
      return false;
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
      return UseVectorIntrinsic || !NeedToScalarize;
    }
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      assert(CM.getWideningDecision(I, VF) ==
                 LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care by now");
      return false;
    }
    return true;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return false;

  // Success: widen this instruction. We optimize the common case where
  // consecutive instructions can be represented by a single recipe.
  if (!VPBB->empty()) {
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
      return true;
  }

  VPBB->appendRecipe(new VPWidenRecipe(I));
  return true;
}
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.
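  // Illustrative sketch of the structure emitted for a predicated store and
  // one lane (block names follow the RegionName scheme used below):
  //   pred.store.if:
  //     store i32 %lane.val, i32* %lane.ptr
  //     br label %pred.store.continue
  //   pred.store.continue:
  //     ...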
  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                        VPlanPtr &Plan, VPBasicBlock *VPBB) {
  VPRecipeBase *Recipe = nullptr;
  // Check if Instr should belong to an interleave memory recipe, or already
  // does. In the latter case Instr is irrelevant.
  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr is a memory operation that should be widened.
  if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }

  // Check if Instr should form some PHI recipe.
  if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if ((Recipe = tryToBlend(Instr, Plan))) {
    VPBB->appendRecipe(Recipe);
    return true;
  }
  if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
    VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
    return true;
  }

  // Check if Instr is to be widened by a general VPWidenRecipe, after
  // having first checked for specific widening recipes that deal with
  // Interleave Groups, Inductions and Phi nodes.
  if (tryToWiden(Instr, VPBB, Range))
    return true;

  return false;
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
                                                        unsigned MaxVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");

  // Collect conditions feeding internal conditional branches; they need to be
  // represented in VPlan for it to model masking.
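  // For example, for a loop body containing
  //   if (a[i] > 0) b[i] = 1;
  // the compare feeding the conditional branch gets a VPValue def here so
  // createEdgeMask() can later fold it into the mask of the guarded block.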
  SmallPtrSet<Value *, 1> NeedDef;

  auto *Latch = OrigLoop->getLoopLatch();
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (BB == Latch)
      continue;
    BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
    if (Branch && Branch->isConditional())
      NeedDef.insert(Branch->getCondition());
  }

  // If the tail is to be folded by masking, the primary induction variable
  // needs to be represented in VPlan for it to model early-exit masking.
  // Also, both the Phi and the live-out instruction of each reduction are
  // required in order to introduce a select between them in VPlan.
  if (CM.foldTailByMasking()) {
    NeedDef.insert(Legal->getPrimaryInduction());
    for (auto &Reduction : *Legal->getReductionVars()) {
      NeedDef.insert(Reduction.first);
      NeedDef.insert(Reduction.second.getLoopExitInstr());
    }
  }

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
    VF = SubRange.End;
  }
}
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = std::make_unique<VPlan>(VPBB);

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    std::vector<Instruction *> Ingredients;

    // Organize the ingredients to vectorize from current basic block in the
    // right order.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
      // member of the IG, do not construct any Recipe for it.
      const InterleaveGroup<Instruction> *IG =
          CM.getInterleavedAccessGroup(Instr);
      if (IG && Instr != IG->getInsertPos() &&
          Range.Start >= 2 && // Query is illegal for VF == 1
          CM.getWideningDecision(Instr, Range.Start) ==
              LoopVectorizationCostModel::CM_Interleave) {
        auto SinkCandidate = SinkAfterInverse.find(Instr);
        if (SinkCandidate != SinkAfterInverse.end())
          Ingredients.push_back(SinkCandidate->second);
        continue;
      }

      // Move instructions to handle first-order recurrences, step 1: avoid
      // handling this instruction until after we've handled the instruction it
      // should follow.
7158 auto SAIt
= SinkAfter
.find(Instr
);
7159 if (SAIt
!= SinkAfter
.end()) {
7160 LLVM_DEBUG(dbgs() << "Sinking" << *SAIt
->first
<< " after"
7162 << " to vectorize a 1st order recurrence.\n");
7163 SinkAfterInverse
[SAIt
->second
] = Instr
;
7167 Ingredients
.push_back(Instr
);
7169 // Move instructions to handle first-order recurrences, step 2: push the
7170 // instruction to be sunk at its insertion point.
7171 auto SAInvIt
= SinkAfterInverse
.find(Instr
);
7172 if (SAInvIt
!= SinkAfterInverse
.end())
7173 Ingredients
.push_back(SAInvIt
->second
);
7176 // Introduce each ingredient into VPlan.
7177 for (Instruction
*Instr
: Ingredients
) {
7178 if (RecipeBuilder
.tryToCreateRecipe(Instr
, Range
, Plan
, VPBB
))
7181 // Otherwise, if all widening options failed, Instruction is to be
7182 // replicated. This may create a successor for VPBB.
7183 VPBasicBlock
*NextVPBB
= RecipeBuilder
.handleReplication(
7184 Instr
, Range
, VPBB
, PredInst2Recipe
, Plan
);
7185 if (NextVPBB
!= VPBB
) {
7187 VPBB
->setName(BB
->hasName() ? BB
->getName() + "." + Twine(VPBBsForBB
++)
7193 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7194 // may also be empty, such as the last one VPBB, reflecting original
7195 // basic-blocks with no recipes.
7196 VPBasicBlock
*PreEntry
= cast
<VPBasicBlock
>(Plan
->getEntry());
7197 assert(PreEntry
->empty() && "Expecting empty pre-entry block.");
7198 VPBlockBase
*Entry
= Plan
->setEntry(PreEntry
->getSingleSuccessor());
7199 VPBlockUtils::disconnectBlocks(PreEntry
, Entry
);
7202 // Finally, if tail is folded by masking, introduce selects between the phi
7203 // and the live-out instruction of each reduction, at the end of the latch.
7204 if (CM
.foldTailByMasking()) {
7205 Builder
.setInsertPoint(VPBB
);
7206 auto *Cond
= RecipeBuilder
.createBlockInMask(OrigLoop
->getHeader(), Plan
);
7207 for (auto &Reduction
: *Legal
->getReductionVars()) {
7208 VPValue
*Phi
= Plan
->getVPValue(Reduction
.first
);
7209 VPValue
*Red
= Plan
->getVPValue(Reduction
.second
.getLoopExitInstr());
7210 Builder
.createNaryOp(Instruction::Select
, {Cond
, Red
, Phi
});
7214 std::string PlanName
;
7215 raw_string_ostream
RSO(PlanName
);
7216 unsigned VF
= Range
.Start
;
7218 RSO
<< "Initial VPlan for VF={" << VF
;
7219 for (VF
*= 2; VF
< Range
.End
; VF
*= 2) {
7225 Plan
->setName(PlanName
);
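// Illustrative sketch (hypothetical IR names): when the tail is folded by
// masking, a sum reduction with loop phi %red.phi and live-out update
// %red.next is guarded at the latch by the header's block-in mask, roughly:
//   %red.folded = select <VF x i1> %header.mask, <VF x i32> %red.next,
//                                                <VF x i32> %red.phi
// so lanes beyond the trip count keep the previous partial sum.
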
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: they may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->empty());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanHCFGTransforms::VPInstructionsToVPRecipes(
      Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}

Value *LoopVectorizationPlanner::VPCallbackILV::
getOrCreateVectorValues(Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}

void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  if (User) {
    O << ", ";
    User->getOperand(0)->printAsOperand(O);
  }
  O << "\\l\"";
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
}

void VPWidenRecipe::execute(VPTransformState &State) {
  for (auto &Instr : make_range(Begin, End))
    State.ILV->widenInstruction(Instr);
}

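// Illustrative sketch (hypothetical IR, VF = 4): widening an ingredient such as
//   %add = add nsw i32 %a, %b
// produces one wide instruction per unroll part, e.g.
//   %wide.add = add nsw <4 x i32> %wide.a, %wide.b
// using the vector operands cached by the InnerLoopVectorizer.
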
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}

void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = Phi->getNumIncomingValues();

  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with predecessors having a full mask");
  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               ( ...)))
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}

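// Illustrative sketch (hypothetical values): a three-way blend with incoming
// vector values In1..In3 and edge masks Mask2/Mask3 becomes, per part,
//   %predphi  = select <VF x i1> %Mask2, <VF x i32> %In2, <VF x i32> %In1
//   %predphi1 = select <VF x i1> %Mask3, <VF x i32> %In3, <VF x i32> %predphi
// i.e. the SELECT(Mask3, In3, SELECT(Mask2, In2, In1)) chain described above.
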
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  if (!User)
    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
}

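// Illustrative sketch (hypothetical IR): an interleave group of factor 2, say
// the pair A[2*i] and A[2*i+1], is emitted at the group's insert position as a
// single wide load of 2 * VF elements followed by shufflevectors that
// de-interleave the even and odd lanes; when a mask is present, the per-part
// mask values gathered above are passed along to guard the wide access.
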
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}

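// Illustrative sketch: with UF = 2 and VF = 4, a non-uniform replicated
// ingredient is cloned 2 * 4 = 8 times (one scalar copy per part and lane),
// whereas a uniform one (EndLane == 1) is cloned only twice, for lane 0 of
// each part.
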
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

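// Illustrative sketch (hypothetical IR, lane 1 of some part): when the block-in
// mask is a vector, the lane's condition bit is extracted and branched on, e.g.
//   %c = extractelement <4 x i1> %block.mask, i32 1
//   br i1 %c, label %pred.if, label %pred.continue
// with both successors filled in later, as the comment above notes.
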
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}

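// Illustrative sketch (hypothetical IR names): for a scalar-only predicated
// value, the merge phi created above looks roughly like
//   %merged = phi i32 [ undef, %predicating.block ], [ %x, %predicated.block ]
// i.e. undef flows in from the predicating block and the computed value from
// the predicated block; vector users instead get a phi over the partially
// filled insertelement chain.
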
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  if (!User)
    return State.ILV->vectorizeMemoryInstruction(&Instr);

  // Last (and currently only) operand is a mask.
  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
  for (unsigned Part = 0; Part < State.UF; ++Part)
    MaskValues[Part] = State.get(Mask, Part);
  State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}

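// Illustrative sketch (hypothetical IR, VF = 4): with a mask present, the
// widened store is expected to become a masked intrinsic along the lines of
//   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %p,
//                                              i32 4, <4 x i1> %mask)
// whereas the unmasked path above emits a plain wide load or store.
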
static ScalarEpilogueLowering
getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
      (F->hasOptSize() ||
       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
    SEL = CM_ScalarEpilogueNotAllowedOptSize;
  else if (PreferPredicateOverEpilog || Hints.getPredicate())
    SEL = CM_ScalarEpilogueNotNeededUsePredicate;

  return SEL;
}

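// Illustrative sketch: with -Os (or a loop header that profile data marks as
// cold) and no forced 'vectorize(enable)' hint, the result is
// CM_ScalarEpilogueNotAllowedOptSize; with the PreferPredicateOverEpilog
// option or Hints.getPredicate() set it is
// CM_ScalarEpilogueNotNeededUsePredicate, which later drives tail folding by
// masking instead of emitting a scalar remainder loop.
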
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
  unsigned UserIC = Hints.getInterleave();

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);
  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}

bool LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return false;

  bool Changed = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= processLoop(L);
  }

  // Process each loop nest in the function.
  return Changed;
}

7867 // Process each loop nest in the function.
7871 PreservedAnalyses
LoopVectorizePass::run(Function
&F
,
7872 FunctionAnalysisManager
&AM
) {
7873 auto &SE
= AM
.getResult
<ScalarEvolutionAnalysis
>(F
);
7874 auto &LI
= AM
.getResult
<LoopAnalysis
>(F
);
7875 auto &TTI
= AM
.getResult
<TargetIRAnalysis
>(F
);
7876 auto &DT
= AM
.getResult
<DominatorTreeAnalysis
>(F
);
7877 auto &BFI
= AM
.getResult
<BlockFrequencyAnalysis
>(F
);
7878 auto &TLI
= AM
.getResult
<TargetLibraryAnalysis
>(F
);
7879 auto &AA
= AM
.getResult
<AAManager
>(F
);
7880 auto &AC
= AM
.getResult
<AssumptionAnalysis
>(F
);
7881 auto &DB
= AM
.getResult
<DemandedBitsAnalysis
>(F
);
7882 auto &ORE
= AM
.getResult
<OptimizationRemarkEmitterAnalysis
>(F
);
7883 MemorySSA
*MSSA
= EnableMSSALoopDependency
7884 ? &AM
.getResult
<MemorySSAAnalysis
>(F
).getMSSA()
7887 auto &LAM
= AM
.getResult
<LoopAnalysisManagerFunctionProxy
>(F
).getManager();
7888 std::function
<const LoopAccessInfo
&(Loop
&)> GetLAA
=
7889 [&](Loop
&L
) -> const LoopAccessInfo
& {
7890 LoopStandardAnalysisResults AR
= {AA
, AC
, DT
, LI
, SE
, TLI
, TTI
, MSSA
};
7891 return LAM
.getResult
<LoopAccessAnalysis
>(L
, AR
);
7893 const ModuleAnalysisManager
&MAM
=
7894 AM
.getResult
<ModuleAnalysisManagerFunctionProxy
>(F
).getManager();
7895 ProfileSummaryInfo
*PSI
=
7896 MAM
.getCachedResult
<ProfileSummaryAnalysis
>(*F
.getParent());
7898 runImpl(F
, SE
, LI
, TTI
, DT
, BFI
, &TLI
, DB
, AA
, AC
, GetLAA
, ORE
, PSI
);
7900 return PreservedAnalyses::all();
7901 PreservedAnalyses PA
;
7903 // We currently do not preserve loopinfo/dominator analyses with outer loop
7904 // vectorization. Until this is addressed, mark these analyses as preserved
7905 // only for non-VPlan-native path.
7906 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7907 if (!EnableVPlanNativePath
) {
7908 PA
.preserve
<LoopAnalysis
>();
7909 PA
.preserve
<DominatorTreeAnalysis
>();
7911 PA
.preserve
<BasicAA
>();
7912 PA
.preserve
<GlobalsAA
>();