//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
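//
// For example, with a vectorization factor (VF) of 4 the transformation is,
// conceptually (shown here as C-like pseudo-code rather than the LLVM-IR the
// pass actually emits):
//
//   for (i = 0; i < n; ++i)           for (i = 0; i < n; i += 4)
//     A[i] = B[i] + C[i];       =>      A[i..i+3] = B[i..i+3] + C[i..i+3];
//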
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
156 using namespace llvm;
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169 "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171 "llvm.loop.vectorize.followup_epilogue";
172 /// @}
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; this lists all options. I.e., the
// vectorizer will try to fold the tail loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
enum Option {
  ScalarEpilogue = 0,
  PredicateElseScalarEpilogue,
  PredicateOrDontVectorize
};
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));

cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
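  // For example, on targets where x86_fp80 has a type size of 80 bits but an
  // alloc size of 96 or 128 bits, [N x x86_fp80] is padded and is therefore
  // not bitcast compatible with <N x x86_fp80>.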
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
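/// For example, under that assumption the cost of a block that executes under
/// an if inside the loop is divided by 2 when it is added to the total loop
/// cost.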
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }
467 /// Create a new empty loop that will contain vectorized instructions later
468 /// on, while the old loop will be used as the scalar remainder. Control flow
469 /// is generated around the vectorized (and scalar epilogue) loops consisting
470 /// of various checks and bypasses. Return the pre-header block of the new
471 /// loop.
472 /// In the case of epilogue vectorization, this function is overriden to
473 /// handle the more complex control flow around the loops.
474 virtual BasicBlock *createVectorizedLoopSkeleton();
476 /// Widen a single instruction within the innermost loop.
477 void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
478 VPTransformState &State);
480 /// Widen a single call instruction within the innermost loop.
481 void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
482 VPTransformState &State);
484 /// Widen a single select instruction within the innermost loop.
485 void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
486 bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single first-order recurrence or pointer induction PHINode in
  /// a block. This method handles the induction variable canonicalization. It
  /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
                           VPTransformState &State);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
                             VPValue *Def, VPValue *CastDef,
                             VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in \p V.
  /// If \p CustomBuilder is None, the class member's Builder is used instead.
  void setDebugLocFromInst(const Value *V,
                           Optional<IRBuilder<> *> CustomBuilder = None);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                               VPTransformState &State);

  /// Fixup the LCSSA phi nodes in the unique exit block. This simply
  /// means we need to add the appropriate incoming value from the middle
  /// block as exiting edges from the scalar epilogue loop (if present) are
  /// already in place, and we exit the vector loop exclusively to the middle
  /// block.
  void fixLCSSAPHIs(VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIndex.
  /// \p Opcode is relevant for FP induction variable.
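  /// For example, with StartIdx == 0, Step == 1 and a 4-element Val, the
  /// vector <0, 1, 2, 3> is added element-wise to Val.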
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                                   Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID, VPValue *Def,
                        VPValue *CastDef, VPTransformState &State);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Value *Start,
                                       Instruction *EntryVal, VPValue *Def,
                                       VPValue *CastDef,
                                       VPTransformState &State);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(
      const InductionDescriptor &ID, const Instruction *EntryVal,
      Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
      unsigned Part, unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks. Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata). Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning. It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks. The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
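  /// For example, with TripCount == 103, VF == 8 and UF == 2 this is 96; the
  /// vector loop covers 96 of the original iterations and the remaining 7 run
  /// in the scalar epilogue.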
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
  GeneratedRTChecks &RTChecks;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                           Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
                            Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    B->SetCurrentDebugLocation(DebugLoc());
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

/// Return a value for Step multiplied by VF.
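/// For example, Step == 2 with VF == 4 yields the constant 8, while
/// VF == <vscale x 4> yields 8 * vscale.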
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}

namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
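/// The ordering compares (isScalable, known minimum value), so all fixed VFs
/// (e.g. 4, 8, 16) order before all scalable VFs (e.g. vscale x 4).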
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
    return expectedCost(UserVF).first.isValid();
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1280 /// Memory access instruction may be vectorized in more than one way.
1281 /// Form of instruction after vectorization depends on cost.
1282 /// This function takes cost-based decisions for Load/Store instructions
1283 /// and collects them in a map. This decisions map is used for building
1284 /// the lists of loop-uniform and loop-scalar instructions.
1285 /// The calculated cost is saved with widening decision in order to
1286 /// avoid redundant calculations.
1287 void setCostBasedWideningDecision(ElementCount VF);
1289 /// A struct that represents some properties of the register usage
1290 /// of a loop.
1291 struct RegisterUsage {
1292 /// Holds the number of loop invariant values that are used in the loop.
1293 /// The key is ClassID of target-provided register class.
1294 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1295 /// Holds the maximum number of concurrent live intervals in the loop.
1296 /// The key is ClassID of target-provided register class.
1297 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1300 /// \return Information about the register usage of the loop for the
1301 /// given vectorization factors.
1302 SmallVector<RegisterUsage, 8>
1303 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1305 /// Collect values we want to ignore in the cost model.
1306 void collectValuesToIgnore();
1308 /// Collect all element types in the loop for which widening is needed.
1309 void collectElementTypesForWidening();
1311 /// Split reductions into those that happen in the loop, and those that happen
1312 /// outside. In-loop reductions are collected into InLoopReductionChains.
1313 void collectInLoopReductions();
1315 /// Returns true if we should use strict in-order reductions for the given
1316 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1317 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1318 /// of FP operations.
1319 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1320 return ForceOrderedReductions && !Hints->allowReordering() &&
1321 RdxDesc.isOrdered();
1324 /// \returns The smallest bitwidth each instruction can be represented with.
1325 /// The vector equivalents of these instructions should be truncated to this
1326 /// type.
1327 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1328 return MinBWs;
1331 /// \returns True if it is more profitable to scalarize instruction \p I for
1332 /// vectorization factor \p VF.
1333 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1334 assert(VF.isVector() &&
1335 "Profitable to scalarize relevant only for VF > 1.");
1337 // Cost model is not run in the VPlan-native path - return conservative
1338 // result until this changes.
1339 if (EnableVPlanNativePath)
1340 return false;
1342 auto Scalars = InstsToScalarize.find(VF);
1343 assert(Scalars != InstsToScalarize.end() &&
1344 "VF not yet analyzed for scalarization profitability");
1345 return Scalars->second.find(I) != Scalars->second.end();
1348 /// Returns true if \p I is known to be uniform after vectorization.
1349 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1350 if (VF.isScalar())
1351 return true;
1353 // Cost model is not run in the VPlan-native path - return conservative
1354 // result until this changes.
1355 if (EnableVPlanNativePath)
1356 return false;
1358 auto UniformsPerVF = Uniforms.find(VF);
1359 assert(UniformsPerVF != Uniforms.end() &&
1360 "VF not yet analyzed for uniformity");
1361 return UniformsPerVF->second.count(I);
1364 /// Returns true if \p I is known to be scalar after vectorization.
1365 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1366 if (VF.isScalar())
1367 return true;
1369 // Cost model is not run in the VPlan-native path - return conservative
1370 // result until this changes.
1371 if (EnableVPlanNativePath)
1372 return false;
1374 auto ScalarsPerVF = Scalars.find(VF);
1375 assert(ScalarsPerVF != Scalars.end() &&
1376 "Scalar values are not calculated for VF");
1377 return ScalarsPerVF->second.count(I);
1380 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1381 /// for vectorization factor \p VF.
1382 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1383 return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1384 !isProfitableToScalarize(I, VF) &&
1385 !isScalarAfterVectorization(I, VF);
1388 /// Decision that was taken during cost calculation for memory instruction.
1389 enum InstWidening {
1390 CM_Unknown,
1391 CM_Widen, // For consecutive accesses with stride +1.
1392 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1393 CM_Interleave,
1394 CM_GatherScatter,
1395 CM_Scalarize
1398 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1399 /// instruction \p I and vector width \p VF.
1400 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1401 InstructionCost Cost) {
1402 assert(VF.isVector() && "Expected VF >=2");
1403 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1406 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1407 /// interleaving group \p Grp and vector width \p VF.
1408 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1409 ElementCount VF, InstWidening W,
1410 InstructionCost Cost) {
1411 assert(VF.isVector() && "Expected VF >=2");
1412 /// Broadcast this decision to all instructions inside the group.
1413 /// But the cost will be assigned to one instruction only.
1414 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1415 if (auto *I = Grp->getMember(i)) {
1416 if (Grp->getInsertPos() == I)
1417 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1418 else
1419 WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1424 /// Return the cost model decision for the given instruction \p I and vector
1425 /// width \p VF. Return CM_Unknown if this instruction did not pass
1426 /// through the cost modeling.
1427 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1428 assert(VF.isVector() && "Expected VF to be a vector VF");
1429 // Cost model is not run in the VPlan-native path - return conservative
1430 // result until this changes.
1431 if (EnableVPlanNativePath)
1432 return CM_GatherScatter;
1434 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1435 auto Itr = WideningDecisions.find(InstOnVF);
1436 if (Itr == WideningDecisions.end())
1437 return CM_Unknown;
1438 return Itr->second.first;
1441 /// Return the vectorization cost for the given instruction \p I and vector
1442 /// width \p VF.
1443 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1444 assert(VF.isVector() && "Expected VF >=2");
1445 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1446 assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1447 "The cost is not calculated");
1448 return WideningDecisions[InstOnVF].second;
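  // Illustrative example (not actual pass code): for a unit-stride load L and
  // VF = 4, the cost model would typically record
  //   setWideningDecision(L, ElementCount::getFixed(4), CM_Widen, Cost);
  // after which getWideningDecision(L, VF) returns CM_Widen and
  // getWideningCost(L, VF) returns the recorded Cost.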
1451 /// Return True if instruction \p I is an optimizable truncate whose operand
1452 /// is an induction variable. Such a truncate will be removed by adding a new
1453 /// induction variable with the destination type.
1454 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1455 // If the instruction is not a truncate, return false.
1456 auto *Trunc = dyn_cast<TruncInst>(I);
1457 if (!Trunc)
1458 return false;
1460 // Get the source and destination types of the truncate.
1461 Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1462 Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1464 // If the truncate is free for the given types, return false. Replacing a
1465 // free truncate with an induction variable would add an induction variable
1466 // update instruction to each iteration of the loop. We exclude from this
1467 // check the primary induction variable since it will need an update
1468 // instruction regardless.
1469 Value *Op = Trunc->getOperand(0);
1470 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1471 return false;
1473 // If the truncated value is not an induction variable, return false.
1474 return Legal->isInductionPhi(Op);
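  // Illustrative example: in a loop such as
  //   for (i64 i = 0; i < n; ++i) A[i] = (i32)i;
  // the 'trunc i64 %i to i32' feeding the store is an optimizable IV truncate:
  // instead of truncating the wide IV on every iteration, a new i32 induction
  // variable with the destination type can be created.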
1477 /// Collects the instructions to scalarize for each predicated instruction in
1478 /// the loop.
1479 void collectInstsToScalarize(ElementCount VF);
1481 /// Collect Uniform and Scalar values for the given \p VF.
1482 /// The sets depend on CM decision for Load/Store instructions
1483 /// that may be vectorized as interleave, gather-scatter or scalarized.
1484 void collectUniformsAndScalars(ElementCount VF) {
1485 // Do the analysis once.
1486 if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1487 return;
1488 setCostBasedWideningDecision(VF);
1489 collectLoopUniforms(VF);
1490 collectLoopScalars(VF);
1493 /// Returns true if the target machine supports masked store operation
1494 /// for the given \p DataType and kind of access to \p Ptr.
1495 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1496 return Legal->isConsecutivePtr(Ptr) &&
1497 TTI.isLegalMaskedStore(DataType, Alignment);
1500 /// Returns true if the target machine supports masked load operation
1501 /// for the given \p DataType and kind of access to \p Ptr.
1502 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1503 return Legal->isConsecutivePtr(Ptr) &&
1504 TTI.isLegalMaskedLoad(DataType, Alignment);
1507 /// Returns true if the target machine can represent \p V as a masked gather
1508 /// or scatter operation.
1509 bool isLegalGatherOrScatter(Value *V) {
1510 bool LI = isa<LoadInst>(V);
1511 bool SI = isa<StoreInst>(V);
1512 if (!LI && !SI)
1513 return false;
1514 auto *Ty = getLoadStoreType(V);
1515 Align Align = getLoadStoreAlignment(V);
1516 return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1517 (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520 /// Returns true if the target machine supports all of the reduction
1521 /// variables found for the given VF.
1522 bool canVectorizeReductions(ElementCount VF) const {
1523 return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1524 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1525 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1526 }));
1529 /// Returns true if \p I is an instruction that will be scalarized with
1530 /// predication. Such instructions include conditional stores and
1531 /// instructions that may divide by zero.
1532 /// If a non-zero VF has been calculated, we check if I will be scalarized
1533 /// with predication for that VF.
1534 bool isScalarWithPredication(Instruction *I) const;
1536 // Returns true if \p I is an instruction that will be predicated either
1537 // through scalar predication or masked load/store or masked gather/scatter.
1538 // Superset of instructions that return true for isScalarWithPredication.
1539 bool isPredicatedInst(Instruction *I) {
1540 if (!blockNeedsPredication(I->getParent()))
1541 return false;
1542 // Loads and stores that need some form of masked operation are predicated
1543 // instructions.
1544 if (isa<LoadInst>(I) || isa<StoreInst>(I))
1545 return Legal->isMaskRequired(I);
1546 return isScalarWithPredication(I);
1549 /// Returns true if \p I is a memory instruction with consecutive memory
1550 /// access that can be widened.
1551 bool
1552 memoryInstructionCanBeWidened(Instruction *I,
1553 ElementCount VF = ElementCount::getFixed(1));
1555 /// Returns true if \p I is a memory instruction in an interleaved-group
1556 /// of memory accesses that can be vectorized with wide vector loads/stores
1557 /// and shuffles.
1558 bool
1559 interleavedAccessCanBeWidened(Instruction *I,
1560 ElementCount VF = ElementCount::getFixed(1));
1562 /// Check if \p Instr belongs to any interleaved access group.
1563 bool isAccessInterleaved(Instruction *Instr) {
1564 return InterleaveInfo.isInterleaved(Instr);
1567 /// Get the interleaved access group that \p Instr belongs to.
1568 const InterleaveGroup<Instruction> *
1569 getInterleavedAccessGroup(Instruction *Instr) {
1570 return InterleaveInfo.getInterleaveGroup(Instr);
1573 /// Returns true if we're required to use a scalar epilogue for at least
1574 /// the final iteration of the original loop.
1575 bool requiresScalarEpilogue(ElementCount VF) const {
1576 if (!isScalarEpilogueAllowed())
1577 return false;
1578 // If we might exit from anywhere but the latch, we must run the exiting
1579 // iteration in scalar form.
1580 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1581 return true;
1582 return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1585 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1586 /// loop hint annotation.
1587 bool isScalarEpilogueAllowed() const {
1588 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1591 /// Returns true if all loop blocks should be masked to fold the tail of the loop.
1592 bool foldTailByMasking() const { return FoldTailByMasking; }
1594 bool blockNeedsPredication(BasicBlock *BB) const {
1595 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1598 /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1599 /// nodes to the chain of instructions representing the reductions. Uses a
1600 /// MapVector to ensure deterministic iteration order.
1601 using ReductionChainMap =
1602 SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1604 /// Return the chain of instructions representing an inloop reduction.
1605 const ReductionChainMap &getInLoopReductionChains() const {
1606 return InLoopReductionChains;
1609 /// Returns true if the Phi is part of an inloop reduction.
1610 bool isInLoopReduction(PHINode *Phi) const {
1611 return InLoopReductionChains.count(Phi);
1614 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1615 /// with factor VF. Return the cost of the instruction, including
1616 /// scalarization overhead if it's needed.
1617 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1619 /// Estimate cost of a call instruction CI if it were vectorized with factor
1620 /// VF. Return the cost of the instruction, including scalarization overhead
1621 /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1622 /// scalarized -
1623 /// i.e. either a vector version isn't available, or it is too expensive.
1624 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1625 bool &NeedToScalarize) const;
1627 /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1628 /// that of B.
1629 bool isMoreProfitable(const VectorizationFactor &A,
1630 const VectorizationFactor &B) const;
1632 /// Invalidates decisions already taken by the cost model.
1633 void invalidateCostModelingDecisions() {
1634 WideningDecisions.clear();
1635 Uniforms.clear();
1636 Scalars.clear();
1639 private:
1640 unsigned NumPredStores = 0;
1642 /// \return An upper bound for the vectorization factors for both
1643 /// fixed and scalable vectorization, where the minimum-known number of
1644 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1645 /// disabled or unsupported, then the scalable part will be equal to
1646 /// ElementCount::getScalable(0).
1647 FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1648 ElementCount UserVF);
1650 /// \return the maximized element count based on the target's vector
1651 /// registers and the loop trip-count, but limited to a maximum safe VF.
1652 /// This is a helper function of computeFeasibleMaxVF.
1653 /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1654 /// issue that occurred on one of the buildbots which cannot be reproduced
1655 /// without having access to the proprietary compiler (see comments on
1656 /// D98509). The issue is currently under investigation and this workaround
1657 /// will be removed as soon as possible.
1658 ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1659 unsigned SmallestType,
1660 unsigned WidestType,
1661 const ElementCount &MaxSafeVF);
1663 /// \return the maximum legal scalable VF, based on the safe max number
1664 /// of elements.
1665 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1667 /// The vectorization cost is a combination of the cost itself and a boolean
1668 /// indicating whether any of the contributing operations will actually
1669 /// operate on vector values after type legalization in the backend. If this
1670 /// latter value is false, then all operations will be scalarized (i.e. no
1671 /// vectorization has actually taken place).
1672 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1674 /// Returns the expected execution cost. The unit of the cost does
1675 /// not matter because we use the 'cost' units to compare different
1676 /// vector widths. The cost that is returned is *not* normalized by
1677 /// the factor width. If \p Invalid is not nullptr, this function
1678 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1679 /// each instruction that has an Invalid cost for the given VF.
1680 using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1681 VectorizationCostTy
1682 expectedCost(ElementCount VF,
1683 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1685 /// Returns the execution time cost of an instruction for a given vector
1686 /// width. Vector width of one means scalar.
1687 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1689 /// The cost-computation logic from getInstructionCost which provides
1690 /// the vector type as an output parameter.
1691 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1692 Type *&VectorTy);
1694 /// Return the cost of instructions in an inloop reduction pattern, if I is
1695 /// part of that pattern.
1696 Optional<InstructionCost>
1697 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1698 TTI::TargetCostKind CostKind);
1700 /// Calculate vectorization cost of memory instruction \p I.
1701 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1703 /// The cost computation for scalarized memory instruction.
1704 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1706 /// The cost computation for interleaving group of memory instructions.
1707 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1709 /// The cost computation for Gather/Scatter instruction.
1710 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1712 /// The cost computation for widening instruction \p I with consecutive
1713 /// memory access.
1714 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1716 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1717 /// Load: scalar load + broadcast.
1718 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1719 /// element)
1720 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1722 /// Estimate the overhead of scalarizing an instruction. This is a
1723 /// convenience wrapper for the type-based getScalarizationOverhead API.
1724 InstructionCost getScalarizationOverhead(Instruction *I,
1725 ElementCount VF) const;
1727 /// Returns whether the instruction is a load or store and will be emitted
1728 /// as a vector operation.
1729 bool isConsecutiveLoadOrStore(Instruction *I);
1731 /// Returns true if an artificially high cost for emulated masked memrefs
1732 /// should be used.
1733 bool useEmulatedMaskMemRefHack(Instruction *I);
1735 /// Map of scalar integer values to the smallest bitwidth they can be legally
1736 /// represented as. The vector equivalents of these values should be truncated
1737 /// to this type.
1738 MapVector<Instruction *, uint64_t> MinBWs;
1740 /// A type representing the costs for instructions if they were to be
1741 /// scalarized rather than vectorized. The entries are Instruction-Cost
1742 /// pairs.
1743 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1745 /// A set containing all BasicBlocks that are known to be present after
1746 /// vectorization as predicated blocks.
1747 SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1749 /// Records whether it is allowed to have the original scalar loop execute at
1750 /// least once. This may be needed as a fallback loop in case runtime
1751 /// aliasing/dependence checks fail, or to handle the tail/remainder
1752 /// iterations when the trip count is unknown or is not a multiple of the VF,
1753 /// or as a peel-loop to handle gaps in interleave-groups.
1754 /// Under optsize and when the trip count is very small we don't allow any
1755 /// iterations to execute in the scalar loop.
1756 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1758 /// All blocks of loop are to be masked to fold tail of scalar iterations.
1759 bool FoldTailByMasking = false;
1761 /// A map holding scalar costs for different vectorization factors. The
1762 /// presence of a cost for an instruction in the mapping indicates that the
1763 /// instruction will be scalarized when vectorizing with the associated
1764 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1765 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1767 /// Holds the instructions known to be uniform after vectorization.
1768 /// The data is collected per VF.
1769 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1771 /// Holds the instructions known to be scalar after vectorization.
1772 /// The data is collected per VF.
1773 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1775 /// Holds the instructions (address computations) that are forced to be
1776 /// scalarized.
1777 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1779 /// PHINodes of the reductions that should be expanded in-loop along with
1780 /// their associated chains of reduction operations, in program order from top
1781 /// (PHI) to bottom.
1782 ReductionChainMap InLoopReductionChains;
1784 /// A Map of inloop reduction operations and their immediate chain operand.
1785 /// FIXME: This can be removed once reductions can be costed correctly in
1786 /// vplan. This was added to allow quick lookup to the inloop operations,
1787 /// without having to loop through InLoopReductionChains.
1788 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1790 /// Returns the expected difference in cost from scalarizing the expression
1791 /// feeding a predicated instruction \p PredInst. The instructions to
1792 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1793 /// non-negative return value implies the expression will be scalarized.
1794 /// Currently, only single-use chains are considered for scalarization.
1795 int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1796 ElementCount VF);
1798 /// Collect the instructions that are uniform after vectorization. An
1799 /// instruction is uniform if we represent it with a single scalar value in
1800 /// the vectorized loop corresponding to each vector iteration. Examples of
1801 /// uniform instructions include pointer operands of consecutive or
1802 /// interleaved memory accesses. Note that although uniformity implies an
1803 /// instruction will be scalar, the reverse is not true. In general, a
1804 /// scalarized instruction will be represented by VF scalar values in the
1805 /// vectorized loop, each corresponding to an iteration of the original
1806 /// scalar loop.
1807 void collectLoopUniforms(ElementCount VF);
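  // Illustrative example of the distinction: for a consecutive store to A[i]
  // with VF = 4, the address computation feeding the store is uniform after
  // vectorization (a single scalar GEP per vector iteration), whereas an
  // instruction scalarized under predication is scalar after vectorization
  // but not uniform (VF scalar copies, one per lane).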
1809 /// Collect the instructions that are scalar after vectorization. An
1810 /// instruction is scalar if it is known to be uniform or will be scalarized
1811 /// during vectorization. Non-uniform scalarized instructions will be
1812 /// represented by VF values in the vectorized loop, each corresponding to an
1813 /// iteration of the original scalar loop.
1814 void collectLoopScalars(ElementCount VF);
1816 /// Keeps cost model vectorization decision and cost for instructions.
1817 /// Right now it is used for memory instructions only.
1818 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1819 std::pair<InstWidening, InstructionCost>>;
1821 DecisionList WideningDecisions;
1823 /// Returns true if \p V is expected to be vectorized and it needs to be
1824 /// extracted.
1825 bool needsExtract(Value *V, ElementCount VF) const {
1826 Instruction *I = dyn_cast<Instruction>(V);
1827 if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1828 TheLoop->isLoopInvariant(I))
1829 return false;
1831 // Assume we can vectorize V (and hence we need extraction) if the
1832 // scalars are not computed yet. This can happen, because it is called
1833 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1834 // the scalars are collected. That should be a safe assumption in most
1835 // cases, because we check if the operands have vectorizable types
1836 // beforehand in LoopVectorizationLegality.
1837 return Scalars.find(VF) == Scalars.end() ||
1838 !isScalarAfterVectorization(I, VF);
1841 /// Returns a range containing only operands needing to be extracted.
1842 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1843 ElementCount VF) const {
1844 return SmallVector<Value *, 4>(make_filter_range(
1845 Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1848 /// Determines if we have the infrastructure to vectorize loop \p L and its
1849 /// epilogue, assuming the main loop is vectorized by \p VF.
1850 bool isCandidateForEpilogueVectorization(const Loop &L,
1851 const ElementCount VF) const;
1853 /// Returns true if epilogue vectorization is considered profitable, and
1854 /// false otherwise.
1855 /// \p VF is the vectorization factor chosen for the original loop.
1856 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1858 public:
1859 /// The loop that we evaluate.
1860 Loop *TheLoop;
1862 /// Predicated scalar evolution analysis.
1863 PredicatedScalarEvolution &PSE;
1865 /// Loop Info analysis.
1866 LoopInfo *LI;
1868 /// Vectorization legality.
1869 LoopVectorizationLegality *Legal;
1871 /// Vector target information.
1872 const TargetTransformInfo &TTI;
1874 /// Target Library Info.
1875 const TargetLibraryInfo *TLI;
1877 /// Demanded bits analysis.
1878 DemandedBits *DB;
1880 /// Assumption cache.
1881 AssumptionCache *AC;
1883 /// Interface to emit optimization remarks.
1884 OptimizationRemarkEmitter *ORE;
1886 const Function *TheFunction;
1888 /// Loop Vectorize Hint.
1889 const LoopVectorizeHints *Hints;
1891 /// The interleaved access information contains groups of interleaved accesses
1892 /// with the same stride that are close to each other.
1893 InterleavedAccessInfo &InterleaveInfo;
1895 /// Values to ignore in the cost model.
1896 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1898 /// Values to ignore in the cost model when VF > 1.
1899 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1901 /// All element types found in the loop.
1902 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1904 /// Profitable vector factors.
1905 SmallVector<VectorizationFactor, 8> ProfitableVFs;
1907 } // end namespace llvm
1909 /// Helper struct to manage generating runtime checks for vectorization.
1911 /// The runtime checks are created up-front in temporary blocks to allow better
1912 /// cost estimation, and are un-linked from the existing IR. After deciding to
1913 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1914 /// temporary blocks are completely removed.
1915 class GeneratedRTChecks {
1916 /// Basic block which contains the generated SCEV checks, if any.
1917 BasicBlock *SCEVCheckBlock = nullptr;
1919 /// The value representing the result of the generated SCEV checks. If it is
1920 /// nullptr, either no SCEV checks have been generated or they have been used.
1921 Value *SCEVCheckCond = nullptr;
1923 /// Basic block which contains the generated memory runtime checks, if any.
1924 BasicBlock *MemCheckBlock = nullptr;
1926 /// The value representing the result of the generated memory runtime checks.
1927 /// If it is nullptr, either no memory runtime checks have been generated or
1928 /// they have been used.
1929 Instruction *MemRuntimeCheckCond = nullptr;
1931 DominatorTree *DT;
1932 LoopInfo *LI;
1934 SCEVExpander SCEVExp;
1935 SCEVExpander MemCheckExp;
1937 public:
1938 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1939 const DataLayout &DL)
1940 : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1941 MemCheckExp(SE, DL, "scev.check") {}
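  // Illustrative, simplified usage sketch; SE, DT, LI, DL, L, LAI, UnionPred,
  // Bypass, LoopVectorPreHeader and LoopExitBlock stand for whatever the
  // caller has at hand (the pass driver is authoritative):
  //
  //   GeneratedRTChecks Checks(SE, DT, LI, DL);
  //   Checks.Create(L, LAI, UnionPred);
  //   // ...cost the generated blocks and decide whether to vectorize...
  //   Checks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  //   Checks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);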
1943 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1944 /// accurately estimate the cost of the runtime checks. The blocks are
1945 /// un-linked from the IR and are added back during vector code generation. If
1946 /// there is no vector code generation, the check blocks are removed
1947 /// completely.
1948 void Create(Loop *L, const LoopAccessInfo &LAI,
1949 const SCEVUnionPredicate &UnionPred) {
1951 BasicBlock *LoopHeader = L->getHeader();
1952 BasicBlock *Preheader = L->getLoopPreheader();
1954 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1955 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1956 // may be used by SCEVExpander. The blocks will be un-linked from their
1957 // predecessors and removed from LI & DT at the end of the function.
1958 if (!UnionPred.isAlwaysTrue()) {
1959 SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1960 nullptr, "vector.scevcheck");
1962 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1963 &UnionPred, SCEVCheckBlock->getTerminator());
1966 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1967 if (RtPtrChecking.Need) {
1968 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1969 MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1970 "vector.memcheck");
1972 std::tie(std::ignore, MemRuntimeCheckCond) =
1973 addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1974 RtPtrChecking.getChecks(), MemCheckExp);
1975 assert(MemRuntimeCheckCond &&
1976 "no RT checks generated although RtPtrChecking "
1977 "claimed checks are required");
1980 if (!MemCheckBlock && !SCEVCheckBlock)
1981 return;
1983 // Unhook the temporary blocks with the checks and update various places
1984 // accordingly.
1985 if (SCEVCheckBlock)
1986 SCEVCheckBlock->replaceAllUsesWith(Preheader);
1987 if (MemCheckBlock)
1988 MemCheckBlock->replaceAllUsesWith(Preheader);
1990 if (SCEVCheckBlock) {
1991 SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1992 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1993 Preheader->getTerminator()->eraseFromParent();
1995 if (MemCheckBlock) {
1996 MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1997 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1998 Preheader->getTerminator()->eraseFromParent();
2001 DT->changeImmediateDominator(LoopHeader, Preheader);
2002 if (MemCheckBlock) {
2003 DT->eraseNode(MemCheckBlock);
2004 LI->removeBlock(MemCheckBlock);
2006 if (SCEVCheckBlock) {
2007 DT->eraseNode(SCEVCheckBlock);
2008 LI->removeBlock(SCEVCheckBlock);
2012 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2013 /// unused.
2014 ~GeneratedRTChecks() {
2015 SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2016 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2017 if (!SCEVCheckCond)
2018 SCEVCleaner.markResultUsed();
2020 if (!MemRuntimeCheckCond)
2021 MemCheckCleaner.markResultUsed();
2023 if (MemRuntimeCheckCond) {
2024 auto &SE = *MemCheckExp.getSE();
2025 // Memory runtime check generation creates compares that use expanded
2026 // values. Remove them before running the SCEVExpanderCleaners.
2027 for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2028 if (MemCheckExp.isInsertedInstruction(&I))
2029 continue;
2030 SE.forgetValue(&I);
2031 SE.eraseValueFromMap(&I);
2032 I.eraseFromParent();
2035 MemCheckCleaner.cleanup();
2036 SCEVCleaner.cleanup();
2038 if (SCEVCheckCond)
2039 SCEVCheckBlock->eraseFromParent();
2040 if (MemRuntimeCheckCond)
2041 MemCheckBlock->eraseFromParent();
2044 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2045 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2046 /// depending on the generated condition.
2047 BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2048 BasicBlock *LoopVectorPreHeader,
2049 BasicBlock *LoopExitBlock) {
2050 if (!SCEVCheckCond)
2051 return nullptr;
2052 if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2053 if (C->isZero())
2054 return nullptr;
2056 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2058 BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2059 // Create new preheader for vector loop.
2060 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2061 PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2063 SCEVCheckBlock->getTerminator()->eraseFromParent();
2064 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2065 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2066 SCEVCheckBlock);
2068 DT->addNewBlock(SCEVCheckBlock, Pred);
2069 DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2071 ReplaceInstWithInst(
2072 SCEVCheckBlock->getTerminator(),
2073 BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2074 // Mark the check as used, to prevent it from being removed during cleanup.
2075 SCEVCheckCond = nullptr;
2076 return SCEVCheckBlock;
2079 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2080 /// the branches to branch to the vector preheader or \p Bypass, depending on
2081 /// the generated condition.
2082 BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2083 BasicBlock *LoopVectorPreHeader) {
2084 // Check if we generated code that checks at runtime whether arrays overlap.
2085 if (!MemRuntimeCheckCond)
2086 return nullptr;
2088 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2089 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2090 MemCheckBlock);
2092 DT->addNewBlock(MemCheckBlock, Pred);
2093 DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2094 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2096 if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2097 PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2099 ReplaceInstWithInst(
2100 MemCheckBlock->getTerminator(),
2101 BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2102 MemCheckBlock->getTerminator()->setDebugLoc(
2103 Pred->getTerminator()->getDebugLoc());
2105 // Mark the check as used, to prevent it from being removed during cleanup.
2106 MemRuntimeCheckCond = nullptr;
2107 return MemCheckBlock;
2111 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2112 // vectorization. The loop needs to be annotated with #pragma omp simd
2113 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2114 // vector length information is not provided, vectorization is not considered
2115 // explicit. Interleave hints are not allowed either. These limitations will be
2116 // relaxed in the future.
2117 // Please note that we are currently forced to abuse the pragma 'clang
2118 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2119 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2120 // provides *explicit vectorization hints* (LV can bypass legal checks and
2121 // assume that vectorization is legal). However, both hints are implemented
2122 // using the same metadata (llvm.loop.vectorize, processed by
2123 // LoopVectorizeHints). This will be fixed in the future when the native IR
2124 // representation for pragma 'omp simd' is introduced.
2125 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2126 OptimizationRemarkEmitter *ORE) {
2127 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2128 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2130 // Only outer loops with an explicit vectorization hint are supported.
2131 // Unannotated outer loops are ignored.
2132 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2133 return false;
2135 Function *Fn = OuterLp->getHeader()->getParent();
2136 if (!Hints.allowVectorization(Fn, OuterLp,
2137 true /*VectorizeOnlyWhenForced*/)) {
2138 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2139 return false;
2142 if (Hints.getInterleave() > 1) {
2143 // TODO: Interleave support is future work.
2144 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2145 "outer loops.\n");
2146 Hints.emitRemarkWithHints();
2147 return false;
2150 return true;
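// Illustrative example of an outer loop that satisfies these requirements
// (explicit vector width, no interleave hint):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop to be vectorized
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];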
2153 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2154 OptimizationRemarkEmitter *ORE,
2155 SmallVectorImpl<Loop *> &V) {
2156 // Collect inner loops and outer loops without irreducible control flow. For
2157 // now, only collect outer loops that have explicit vectorization hints. If we
2158 // are stress testing the VPlan H-CFG construction, we collect the outermost
2159 // loop of every loop nest.
2160 if (L.isInnermost() || VPlanBuildStressTest ||
2161 (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2162 LoopBlocksRPO RPOT(&L);
2163 RPOT.perform(LI);
2164 if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2165 V.push_back(&L);
2166 // TODO: Collect inner loops inside marked outer loops in case
2167 // vectorization fails for the outer loop. Do not invoke
2168 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2169 // already known to be reducible. We can use an inherited attribute for
2170 // that.
2171 return;
2174 for (Loop *InnerL : L)
2175 collectSupportedLoops(*InnerL, LI, ORE, V);
2178 namespace {
2180 /// The LoopVectorize Pass.
2181 struct LoopVectorize : public FunctionPass {
2182 /// Pass identification, replacement for typeid
2183 static char ID;
2185 LoopVectorizePass Impl;
2187 explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2188 bool VectorizeOnlyWhenForced = false)
2189 : FunctionPass(ID),
2190 Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2191 initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2194 bool runOnFunction(Function &F) override {
2195 if (skipFunction(F))
2196 return false;
2198 auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2199 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2200 auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2201 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2202 auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2203 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2204 auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2205 auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2206 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2207 auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2208 auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2209 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2210 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2212 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2213 [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2215 return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2216 GetLAA, *ORE, PSI).MadeAnyChange;
2219 void getAnalysisUsage(AnalysisUsage &AU) const override {
2220 AU.addRequired<AssumptionCacheTracker>();
2221 AU.addRequired<BlockFrequencyInfoWrapperPass>();
2222 AU.addRequired<DominatorTreeWrapperPass>();
2223 AU.addRequired<LoopInfoWrapperPass>();
2224 AU.addRequired<ScalarEvolutionWrapperPass>();
2225 AU.addRequired<TargetTransformInfoWrapperPass>();
2226 AU.addRequired<AAResultsWrapperPass>();
2227 AU.addRequired<LoopAccessLegacyAnalysis>();
2228 AU.addRequired<DemandedBitsWrapperPass>();
2229 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2230 AU.addRequired<InjectTLIMappingsLegacy>();
2232 // We currently do not preserve loopinfo/dominator analyses with outer loop
2233 // vectorization. Until this is addressed, mark these analyses as preserved
2234 // only for non-VPlan-native path.
2235 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2236 if (!EnableVPlanNativePath) {
2237 AU.addPreserved<LoopInfoWrapperPass>();
2238 AU.addPreserved<DominatorTreeWrapperPass>();
2241 AU.addPreserved<BasicAAWrapperPass>();
2242 AU.addPreserved<GlobalsAAWrapperPass>();
2243 AU.addRequired<ProfileSummaryInfoWrapperPass>();
2247 } // end anonymous namespace
2249 //===----------------------------------------------------------------------===//
2250 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2251 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2252 //===----------------------------------------------------------------------===//
2254 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2255 // We need to place the broadcast of invariant variables outside the loop,
2256 // but only if it's proven safe to do so. Otherwise, the broadcast will be
2257 // inside the vector loop body.
2258 Instruction *Instr = dyn_cast<Instruction>(V);
2259 bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2260 (!Instr ||
2261 DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2262 // Place the code for broadcasting invariant variables in the new preheader.
2263 IRBuilder<>::InsertPointGuard Guard(Builder);
2264 if (SafeToHoist)
2265 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2267 // Broadcast the scalar into all locations in the vector.
2268 Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2270 return Shuf;
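// Illustrative IR (modulo the undef/poison placeholder used by the current
// revision) for broadcasting a loop-invariant i32 %x with VF = 4, roughly what
// CreateVectorSplat produces:
//
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer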
2273 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2274 const InductionDescriptor &II, Value *Step, Value *Start,
2275 Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2276 VPTransformState &State) {
2277 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2278 "Expected either an induction phi-node or a truncate of it!");
2280 // Construct the initial value of the vector IV in the vector loop preheader
2281 auto CurrIP = Builder.saveIP();
2282 Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2283 if (isa<TruncInst>(EntryVal)) {
2284 assert(Start->getType()->isIntegerTy() &&
2285 "Truncation requires an integer type");
2286 auto *TruncType = cast<IntegerType>(EntryVal->getType());
2287 Step = Builder.CreateTrunc(Step, TruncType);
2288 Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2290 Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2291 Value *SteppedStart =
2292 getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2294 // We create vector phi nodes for both integer and floating-point induction
2295 // variables. Here, we determine the kind of arithmetic we will perform.
2296 Instruction::BinaryOps AddOp;
2297 Instruction::BinaryOps MulOp;
2298 if (Step->getType()->isIntegerTy()) {
2299 AddOp = Instruction::Add;
2300 MulOp = Instruction::Mul;
2301 } else {
2302 AddOp = II.getInductionOpcode();
2303 MulOp = Instruction::FMul;
2306 // Multiply the vectorization factor by the step using integer or
2307 // floating-point arithmetic as appropriate.
2308 Type *StepType = Step->getType();
2309 if (Step->getType()->isFloatingPointTy())
2310 StepType = IntegerType::get(StepType->getContext(),
2311 StepType->getScalarSizeInBits());
2312 Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2313 if (Step->getType()->isFloatingPointTy())
2314 RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2315 Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2317 // Create a vector splat to use in the induction update.
2319 // FIXME: If the step is non-constant, we create the vector splat with
2320 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2321 // handle a constant vector splat.
2322 Value *SplatVF = isa<Constant>(Mul)
2323 ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2324 : Builder.CreateVectorSplat(VF, Mul);
2325 Builder.restoreIP(CurrIP);
2327 // We may need to add the step a number of times, depending on the unroll
2328 // factor. The last of those goes into the PHI.
2329 PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2330 &*LoopVectorBody->getFirstInsertionPt());
2331 VecInd->setDebugLoc(EntryVal->getDebugLoc());
2332 Instruction *LastInduction = VecInd;
2333 for (unsigned Part = 0; Part < UF; ++Part) {
2334 State.set(Def, LastInduction, Part);
2336 if (isa<TruncInst>(EntryVal))
2337 addMetadata(LastInduction, EntryVal);
2338 recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2339 State, Part);
2341 LastInduction = cast<Instruction>(
2342 Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2343 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2346 // Move the last step to the end of the latch block. This ensures consistent
2347 // placement of all induction updates.
2348 auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2349 auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2350 auto *ICmp = cast<Instruction>(Br->getCondition());
2351 LastInduction->moveBefore(ICmp);
2352 LastInduction->setName("vec.ind.next");
2354 VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2355 VecInd->addIncoming(LastInduction, LoopVectorLatch);
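// Illustrative IR for the vector IV created here, assuming a fixed VF = 4,
// UF = 1, start value 0 and step 1:
//
//   vector.body:
//     %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>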
2358 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2359 return Cost->isScalarAfterVectorization(I, VF) ||
2360 Cost->isProfitableToScalarize(I, VF);
2363 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2364 if (shouldScalarizeInstruction(IV))
2365 return true;
2366 auto isScalarInst = [&](User *U) -> bool {
2367 auto *I = cast<Instruction>(U);
2368 return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2370 return llvm::any_of(IV->users(), isScalarInst);
2373 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2374 const InductionDescriptor &ID, const Instruction *EntryVal,
2375 Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2376 unsigned Part, unsigned Lane) {
2377 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2378 "Expected either an induction phi-node or a truncate of it!");
2380 // This induction variable is not the phi from the original loop but the
2381 // newly-created IV based on the proof that the casted Phi is equal to the
2382 // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
2383 // re-uses the same InductionDescriptor that the original IV uses, but we
2384 // don't have to do any recording in this case - that is done when the
2385 // original IV is processed.
2386 if (isa<TruncInst>(EntryVal))
2387 return;
2389 const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2390 if (Casts.empty())
2391 return;
2392 // Only the first Cast instruction in the Casts vector is of interest.
2393 // The rest of the Casts (if they exist) have no uses outside the
2394 // induction update chain itself.
2395 if (Lane < UINT_MAX)
2396 State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2397 else
2398 State.set(CastDef, VectorLoopVal, Part);
2401 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2402 TruncInst *Trunc, VPValue *Def,
2403 VPValue *CastDef,
2404 VPTransformState &State) {
2405 assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2406 "Primary induction variable must have an integer type");
2408 auto II = Legal->getInductionVars().find(IV);
2409 assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2411 auto ID = II->second;
2412 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2414 // The value from the original loop to which we are mapping the new induction
2415 // variable.
2416 Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2418 auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2420 // Generate code for the induction step. Note that induction steps are
2421 // required to be loop-invariant.
2422 auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2423 assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2424 "Induction step should be loop invariant");
2425 if (PSE.getSE()->isSCEVable(IV->getType())) {
2426 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2427 return Exp.expandCodeFor(Step, Step->getType(),
2428 LoopVectorPreHeader->getTerminator());
2430 return cast<SCEVUnknown>(Step)->getValue();
2433 // The scalar value to broadcast. This is derived from the canonical
2434 // induction variable. If a truncation type is given, truncate the canonical
2435 // induction variable and step. Otherwise, derive these values from the
2436 // induction descriptor.
2437 auto CreateScalarIV = [&](Value *&Step) -> Value * {
2438 Value *ScalarIV = Induction;
2439 if (IV != OldInduction) {
2440 ScalarIV = IV->getType()->isIntegerTy()
2441 ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2442 : Builder.CreateCast(Instruction::SIToFP, Induction,
2443 IV->getType());
2444 ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2445 ScalarIV->setName("offset.idx");
2447 if (Trunc) {
2448 auto *TruncType = cast<IntegerType>(Trunc->getType());
2449 assert(Step->getType()->isIntegerTy() &&
2450 "Truncation requires an integer step");
2451 ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2452 Step = Builder.CreateTrunc(Step, TruncType);
2454 return ScalarIV;
2457 // Create the vector values from the scalar IV when we are not creating a
2458 // vector IV.
2459 auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2460 Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2461 for (unsigned Part = 0; Part < UF; ++Part) {
2462 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2463 Value *EntryPart =
2464 getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2465 ID.getInductionOpcode());
2466 State.set(Def, EntryPart, Part);
2467 if (Trunc)
2468 addMetadata(EntryPart, Trunc);
2469 recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2470 State, Part);
2474 // Fast-math-flags propagate from the original induction instruction.
2475 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2476 if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2477 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2479 // Now do the actual transformations, and start with creating the step value.
2480 Value *Step = CreateStepValue(ID.getStep());
2481 if (VF.isZero() || VF.isScalar()) {
2482 Value *ScalarIV = CreateScalarIV(Step);
2483 CreateSplatIV(ScalarIV, Step);
2484 return;
2487 // Determine if we want a scalar version of the induction variable. This is
2488 // true if the induction variable itself is not widened, or if it has at
2489 // least one user in the loop that is not widened.
2490 auto NeedsScalarIV = needsScalarInduction(EntryVal);
2491 if (!NeedsScalarIV) {
2492 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2493 State);
2494 return;
2497 // Try to create a new independent vector induction variable. If we can't
2498 // create the phi node, we will splat the scalar induction variable in each
2499 // loop iteration.
2500 if (!shouldScalarizeInstruction(EntryVal)) {
2501 createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2502 State);
2503 Value *ScalarIV = CreateScalarIV(Step);
2504 // Create scalar steps that can be used by instructions we will later
2505 // scalarize. Note that the addition of the scalar steps will not increase
2506 // the number of instructions in the loop in the common case prior to
2507 // InstCombine. We will be trading one vector extract for each scalar step.
2508 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2509 return;
2512 // All IV users are scalar instructions, so only emit a scalar IV, not a
2513 // vectorized IV. Except when we tail-fold, in which case the splat IV feeds the
2514 // predicate used by the masked loads/stores.
2515 Value *ScalarIV = CreateScalarIV(Step);
2516 if (!Cost->isScalarEpilogueAllowed())
2517 CreateSplatIV(ScalarIV, Step);
2518 buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2521 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2522 Instruction::BinaryOps BinOp) {
2523 // Create and check the types.
2524 auto *ValVTy = cast<VectorType>(Val->getType());
2525 ElementCount VLen = ValVTy->getElementCount();
2527 Type *STy = Val->getType()->getScalarType();
2528 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2529 "Induction Step must be an integer or FP");
2530 assert(Step->getType() == STy && "Step has wrong type");
2532 SmallVector<Constant *, 8> Indices;
2534 // Create a vector of consecutive numbers from zero to VF.
2535 VectorType *InitVecValVTy = ValVTy;
2536 Type *InitVecValSTy = STy;
2537 if (STy->isFloatingPointTy()) {
2538 InitVecValSTy =
2539 IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2540 InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2542 Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2544 // Add on StartIdx
2545 Value *StartIdxSplat = Builder.CreateVectorSplat(
2546 VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2547 InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2549 if (STy->isIntegerTy()) {
2550 Step = Builder.CreateVectorSplat(VLen, Step);
2551 assert(Step->getType() == Val->getType() && "Invalid step vec");
2552 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2553 // which can be found from the original scalar operations.
2554 Step = Builder.CreateMul(InitVec, Step);
2555 return Builder.CreateAdd(Val, Step, "induction");
2558 // Floating point induction.
2559 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2560 "Binary Opcode should be specified for FP induction");
2561 InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2562 Step = Builder.CreateVectorSplat(VLen, Step);
2563 Value *MulOp = Builder.CreateFMul(InitVec, Step);
2564 return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
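// Illustrative example: for a fixed <4 x i64> %val, StartIdx = 2 and Step = 1,
// the result is conceptually
//   %induction = add <4 x i64> %val, <i64 2, i64 3, i64 4, i64 5>
// i.e. Val + (StartIdx + <0, 1, ..., VF-1>) * Step; using CreateStepVector
// keeps the same construction valid for scalable vectors.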
2567 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2568 Instruction *EntryVal,
2569 const InductionDescriptor &ID,
2570 VPValue *Def, VPValue *CastDef,
2571 VPTransformState &State) {
2572 // We shouldn't have to build scalar steps if we aren't vectorizing.
2573 assert(VF.isVector() && "VF should be greater than one");
2574 // Get the value type and ensure it and the step have the same integer type.
2575 Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2576 assert(ScalarIVTy == Step->getType() &&
2577 "Val and Step should have the same type");
2579 // We build scalar steps for both integer and floating-point induction
2580 // variables. Here, we determine the kind of arithmetic we will perform.
2581 Instruction::BinaryOps AddOp;
2582 Instruction::BinaryOps MulOp;
2583 if (ScalarIVTy->isIntegerTy()) {
2584 AddOp = Instruction::Add;
2585 MulOp = Instruction::Mul;
2586 } else {
2587 AddOp = ID.getInductionOpcode();
2588 MulOp = Instruction::FMul;
2591 // Determine the number of scalars we need to generate for each unroll
2592 // iteration. If EntryVal is uniform, we only need to generate the first
2593 // lane. Otherwise, we generate all VF values.
2594 bool IsUniform =
2595 Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2596 unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2597 // Compute the scalar steps and save the results in State.
2598 Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2599 ScalarIVTy->getScalarSizeInBits());
2600 Type *VecIVTy = nullptr;
2601 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2602 if (!IsUniform && VF.isScalable()) {
2603 VecIVTy = VectorType::get(ScalarIVTy, VF);
2604 UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2605 SplatStep = Builder.CreateVectorSplat(VF, Step);
2606 SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2609 for (unsigned Part = 0; Part < UF; ++Part) {
2610 Value *StartIdx0 =
2611 createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2613 if (!IsUniform && VF.isScalable()) {
2614 auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2615 auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2616 if (ScalarIVTy->isFloatingPointTy())
2617 InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2618 auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2619 auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2620 State.set(Def, Add, Part);
2621 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2622 Part);
2623 // It's useful to record the lane values too for the known minimum number
2624 // of elements so we do those below. This improves the code quality when
2625 // trying to extract the first element, for example.
2628 if (ScalarIVTy->isFloatingPointTy())
2629 StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2631 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2632 Value *StartIdx = Builder.CreateBinOp(
2633 AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2634 // The step returned by `createStepForVF` is a runtime-evaluated value
2635 // when VF is scalable. Otherwise, it should be folded into a Constant.
2636 assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2637 "Expected StartIdx to be folded to a constant when VF is not "
2638 "scalable");
2639 auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2640 auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2641 State.set(Def, Add, VPIteration(Part, Lane));
2642 recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2643 Part, Lane);
2648 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2649 const VPIteration &Instance,
2650 VPTransformState &State) {
2651 Value *ScalarInst = State.get(Def, Instance);
2652 Value *VectorValue = State.get(Def, Instance.Part);
2653 VectorValue = Builder.CreateInsertElement(
2654 VectorValue, ScalarInst,
2655 Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2656 State.set(Def, VectorValue, Instance.Part);
2659 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2660 assert(Vec->getType()->isVectorTy() && "Invalid type");
2661 return Builder.CreateVectorReverse(Vec, "reverse");
2664 // Return whether we allow using masked interleave-groups (for dealing with
2665 // strided loads/stores that reside in predicated blocks, or for dealing
2666 // with gaps).
2667 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2668 // If an override option has been passed in for interleaved accesses, use it.
2669 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2670 return EnableMaskedInterleavedMemAccesses;
2672 return TTI.enableMaskedInterleavedAccessVectorization();
2675 // Try to vectorize the interleave group that \p Instr belongs to.
2677 // E.g. Translate the following interleaved load group (factor = 3):
2678 // for (i = 0; i < N; i+=3) {
2679 // R = Pic[i]; // Member of index 0
2680 // G = Pic[i+1]; // Member of index 1
2681 // B = Pic[i+2]; // Member of index 2
2682 // ... // do something to R, G, B
2683 // }
2684 // To:
2685 // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2686 // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2687 // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2688 // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2690 // Or translate the following interleaved store group (factor = 3):
2691 // for (i = 0; i < N; i+=3) {
2692 // ... do something to R, G, B
2693 // Pic[i] = R; // Member of index 0
2694 // Pic[i+1] = G; // Member of index 1
2695 // Pic[i+2] = B; // Member of index 2
2696 // }
2697 // To:
2698 // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2699 // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2700 // %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2701 // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2702 // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2703 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2704 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2705 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2706 VPValue *BlockInMask) {
2707 Instruction *Instr = Group->getInsertPos();
2708 const DataLayout &DL = Instr->getModule()->getDataLayout();
2710 // Prepare for the vector type of the interleaved load/store.
2711 Type *ScalarTy = getLoadStoreType(Instr);
2712 unsigned InterleaveFactor = Group->getFactor();
2713 assert(!VF.isScalable() && "scalable vectors not yet supported.");
2714 auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2716 // Prepare for the new pointers.
2717 SmallVector<Value *, 2> AddrParts;
2718 unsigned Index = Group->getIndex(Instr);
2720 // TODO: extend the masked interleaved-group support to reversed access.
2721 assert((!BlockInMask || !Group->isReverse()) &&
2722 "Reversed masked interleave-group not supported.");
2724 // If the group is reverse, adjust the index to refer to the last vector lane
2725 // instead of the first. We adjust the index from the first vector lane,
2726 // rather than directly getting the pointer for lane VF - 1, because the
2727 // pointer operand of the interleaved access is supposed to be uniform. For
2728 // uniform instructions, we're only required to generate a value for the
2729 // first vector lane in each unroll iteration.
2730 if (Group->isReverse())
2731 Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
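// Illustrative example (assumed values): with VF = 4 and factor = 2, an insert
// position at member index 1 gives Index = 1 + 3 * 2 = 7, so the GEP below
// moves the uniform lane-0 pointer back by 7 elements to member 0 of the
// lowest-addressed tuple covered by this part.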
2733 for (unsigned Part = 0; Part < UF; Part++) {
2734 Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2735 setDebugLocFromInst(AddrPart);
2737 // Note that the current instruction could be at any member index. We need to
2738 // adjust the address to that of the member at index 0.
2740 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2741 // b = A[i]; // Member of index 0
2742 // The current pointer points to A[i+1]; adjust it to A[i].
2744 // E.g. A[i+1] = a; // Member of index 1
2745 // A[i] = b; // Member of index 0
2746 // A[i+2] = c; // Member of index 2 (Current instruction)
2747 // The current pointer points to A[i+2]; adjust it to A[i].
2749 bool InBounds = false;
2750 if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2751 InBounds = gep->isInBounds();
2752 AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2753 cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2755 // Cast to the vector pointer type.
2756 unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2757 Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2758 AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2761 setDebugLocFromInst(Instr);
2762 Value *PoisonVec = PoisonValue::get(VecTy);
2764 Value *MaskForGaps = nullptr;
2765 if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2766 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2767 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2770 // Vectorize the interleaved load group.
2771 if (isa<LoadInst>(Instr)) {
2772 // For each unroll part, create a wide load for the group.
2773 SmallVector<Value *, 2> NewLoads;
2774 for (unsigned Part = 0; Part < UF; Part++) {
2775 Instruction *NewLoad;
2776 if (BlockInMask || MaskForGaps) {
2777 assert(useMaskedInterleavedAccesses(*TTI) &&
2778 "masked interleaved groups are not allowed.");
2779 Value *GroupMask = MaskForGaps;
2780 if (BlockInMask) {
2781 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2782 Value *ShuffledMask = Builder.CreateShuffleVector(
2783 BlockInMaskPart,
2784 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2785 "interleaved.mask");
2786 GroupMask = MaskForGaps
2787 ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2788 MaskForGaps)
2789 : ShuffledMask;
2791 NewLoad =
2792 Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2793 GroupMask, PoisonVec, "wide.masked.vec");
2795 else
2796 NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2797 Group->getAlign(), "wide.vec");
2798 Group->addMetadata(NewLoad);
2799 NewLoads.push_back(NewLoad);
2802 // For each member in the group, shuffle out the appropriate data from the
2803 // wide loads.
2804 unsigned J = 0;
2805 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2806 Instruction *Member = Group->getMember(I);
2808 // Skip the gaps in the group.
2809 if (!Member)
2810 continue;
2812 auto StrideMask =
2813 createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2814 for (unsigned Part = 0; Part < UF; Part++) {
2815 Value *StridedVec = Builder.CreateShuffleVector(
2816 NewLoads[Part], StrideMask, "strided.vec");
2818 // If this member has a different type, cast the result to that type.
2819 if (Member->getType() != ScalarTy) {
2820 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2821 VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2822 StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2825 if (Group->isReverse())
2826 StridedVec = reverseVector(StridedVec);
2828 State.set(VPDefs[J], StridedVec, Part);
2830 ++J;
2832 return;
2835 // The subvector type for the current instruction.
2836 auto *SubVT = VectorType::get(ScalarTy, VF);
2838 // Vectorize the interleaved store group.
2839 MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2840 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2841 "masked interleaved groups are not allowed.");
2842 assert((!MaskForGaps || !VF.isScalable()) &&
2843 "masking gaps for scalable vectors is not yet supported.");
2844 for (unsigned Part = 0; Part < UF; Part++) {
2845 // Collect the stored vector from each member.
2846 SmallVector<Value *, 4> StoredVecs;
2847 for (unsigned i = 0; i < InterleaveFactor; i++) {
2848 assert((Group->getMember(i) || MaskForGaps) &&
2849 "Fail to get a member from an interleaved store group");
2850 Instruction *Member = Group->getMember(i);
2852 // Skip the gaps in the group.
2853 if (!Member) {
2854 Value *Undef = PoisonValue::get(SubVT);
2855 StoredVecs.push_back(Undef);
2856 continue;
2859 Value *StoredVec = State.get(StoredValues[i], Part);
2861 if (Group->isReverse())
2862 StoredVec = reverseVector(StoredVec);
2864 // If this member has a different type, cast it to the unified subvector type.
2866 if (StoredVec->getType() != SubVT)
2867 StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2869 StoredVecs.push_back(StoredVec);
2872 // Concatenate all vectors into a wide vector.
2873 Value *WideVec = concatenateVectors(Builder, StoredVecs);
2875 // Interleave the elements in the wide vector.
2876 Value *IVec = Builder.CreateShuffleVector(
2877 WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2878 "interleaved.vec");
2880 Instruction *NewStoreInstr;
2881 if (BlockInMask || MaskForGaps) {
2882 Value *GroupMask = MaskForGaps;
2883 if (BlockInMask) {
2884 Value *BlockInMaskPart = State.get(BlockInMask, Part);
2885 Value *ShuffledMask = Builder.CreateShuffleVector(
2886 BlockInMaskPart,
2887 createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2888 "interleaved.mask");
2889 GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2890 ShuffledMask, MaskForGaps)
2891 : ShuffledMask;
2893 NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2894 Group->getAlign(), GroupMask);
2895 } else
2896 NewStoreInstr =
2897 Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2899 Group->addMetadata(NewStoreInstr);
2903 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2904 Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2905 VPValue *StoredValue, VPValue *BlockInMask) {
2906 // Attempt to issue a wide load.
2907 LoadInst *LI = dyn_cast<LoadInst>(Instr);
2908 StoreInst *SI = dyn_cast<StoreInst>(Instr);
2910 assert((LI || SI) && "Invalid Load/Store instruction");
2911 assert((!SI || StoredValue) && "No stored value provided for widened store");
2912 assert((!LI || !StoredValue) && "Stored value provided for widened load");
2914 LoopVectorizationCostModel::InstWidening Decision =
2915 Cost->getWideningDecision(Instr, VF);
2916 assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2917 Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2918 Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2919 "CM decision is not to widen the memory instruction");
2921 Type *ScalarDataTy = getLoadStoreType(Instr);
2923 auto *DataTy = VectorType::get(ScalarDataTy, VF);
2924 const Align Alignment = getLoadStoreAlignment(Instr);
2926 // Determine if the pointer operand of the access is either consecutive or
2927 // reverse consecutive.
2928 bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2929 bool ConsecutiveStride =
2930 Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2931 bool CreateGatherScatter =
2932 (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2934 // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2935 // gather/scatter. Otherwise Decision should have been to Scalarize.
2936 assert((ConsecutiveStride || CreateGatherScatter) &&
2937 "The instruction should be scalarized");
2938 (void)ConsecutiveStride;
2940 VectorParts BlockInMaskParts(UF);
2941 bool isMaskRequired = BlockInMask;
2942 if (isMaskRequired)
2943 for (unsigned Part = 0; Part < UF; ++Part)
2944 BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2946 const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2947 // Calculate the pointer for the specific unroll-part.
2948 GetElementPtrInst *PartPtr = nullptr;
2950 bool InBounds = false;
2951 if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2952 InBounds = gep->isInBounds();
2953 if (Reverse) {
2954 // If the address is consecutive but reversed, then the
2955 // wide store needs to start at the last vector element.
2956 // RunTimeVF = VScale * VF.getKnownMinValue()
2957 // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
2958 Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2959 // NumElt = -Part * RunTimeVF
2960 Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2961 // LastLane = 1 - RunTimeVF
2962 Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2963 PartPtr =
2964 cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2965 PartPtr->setIsInBounds(InBounds);
2966 PartPtr = cast<GetElementPtrInst>(
2967 Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2968 PartPtr->setIsInBounds(InBounds);
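// Illustrative example (assumed values): for fixed-width VF = 4 and Part = 1,
// RunTimeVF = 4, NumElt = -4 and LastLane = -3, so PartPtr ends up 7 elements
// below Ptr, the lowest address touched by the reversed part-1 vector.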
2969 if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2970 BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2971 } else {
2972 Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2973 PartPtr = cast<GetElementPtrInst>(
2974 Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2975 PartPtr->setIsInBounds(InBounds);
2978 unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2979 return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2982 // Handle Stores:
2983 if (SI) {
2984 setDebugLocFromInst(SI);
2986 for (unsigned Part = 0; Part < UF; ++Part) {
2987 Instruction *NewSI = nullptr;
2988 Value *StoredVal = State.get(StoredValue, Part);
2989 if (CreateGatherScatter) {
2990 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2991 Value *VectorGep = State.get(Addr, Part);
2992 NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2993 MaskPart);
2994 } else {
2995 if (Reverse) {
2996 // If we store to reverse consecutive memory locations, then we need
2997 // to reverse the order of elements in the stored value.
2998 StoredVal = reverseVector(StoredVal);
2999 // We don't want to update the value in the map as it might be used in
3000 // another expression. So don't call resetVectorValue(StoredVal).
3002 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3003 if (isMaskRequired)
3004 NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3005 BlockInMaskParts[Part]);
3006 else
3007 NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3009 addMetadata(NewSI, SI);
3011 return;
3014 // Handle loads.
3015 assert(LI && "Must have a load instruction");
3016 setDebugLocFromInst(LI);
3017 for (unsigned Part = 0; Part < UF; ++Part) {
3018 Value *NewLI;
3019 if (CreateGatherScatter) {
3020 Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
3021 Value *VectorGep = State.get(Addr, Part);
3022 NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
3023 nullptr, "wide.masked.gather");
3024 addMetadata(NewLI, LI);
3025 } else {
3026 auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3027 if (isMaskRequired)
3028 NewLI = Builder.CreateMaskedLoad(
3029 DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
3030 PoisonValue::get(DataTy), "wide.masked.load");
3031 else
3032 NewLI =
3033 Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
3035 // Add metadata to the load, but set the vector value to the reversed shuffle.
3036 addMetadata(NewLI, LI);
3037 if (Reverse)
3038 NewLI = reverseVector(NewLI);
3041 State.set(Def, NewLI, Part);
3045 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3046 VPUser &User,
3047 const VPIteration &Instance,
3048 bool IfPredicateInstr,
3049 VPTransformState &State) {
3050 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3052 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3053 // the first lane and part.
3054 if (isa<NoAliasScopeDeclInst>(Instr))
3055 if (!Instance.isFirstIteration())
3056 return;
3058 setDebugLocFromInst(Instr);
3060 // Does this instruction return a value?
3061 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3063 Instruction *Cloned = Instr->clone();
3064 if (!IsVoidRetTy)
3065 Cloned->setName(Instr->getName() + ".cloned");
3067 State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3068 Builder.GetInsertPoint());
3069 // Replace the operands of the cloned instructions with their scalar
3070 // equivalents in the new loop.
3071 for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3072 auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3073 auto InputInstance = Instance;
3074 if (!Operand || !OrigLoop->contains(Operand) ||
3075 (Cost->isUniformAfterVectorization(Operand, State.VF)))
3076 InputInstance.Lane = VPLane::getFirstLane();
3077 auto *NewOp = State.get(User.getOperand(op), InputInstance);
3078 Cloned->setOperand(op, NewOp);
3080 addNewMetadata(Cloned, Instr);
3082 // Place the cloned scalar in the new loop.
3083 Builder.Insert(Cloned);
3085 State.set(Def, Cloned, Instance);
3087 // If we just cloned a new assumption, add it to the assumption cache.
3088 if (auto *II = dyn_cast<AssumeInst>(Cloned))
3089 AC->registerAssumption(II);
3091 // End if-block.
3092 if (IfPredicateInstr)
3093 PredicatedInstructions.push_back(Cloned);
3096 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3097 Value *End, Value *Step,
3098 Instruction *DL) {
3099 BasicBlock *Header = L->getHeader();
3100 BasicBlock *Latch = L->getLoopLatch();
3101 // As we're just creating this loop, it's possible no latch exists
3102 // yet. If so, use the header as this will be a single block loop.
3103 if (!Latch)
3104 Latch = Header;
3106 IRBuilder<> B(&*Header->getFirstInsertionPt());
3107 Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3108 setDebugLocFromInst(OldInst, &B);
3109 auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3111 B.SetInsertPoint(Latch->getTerminator());
3112 setDebugLocFromInst(OldInst, &B);
3114 // Create i+1 and fill the PHINode.
3116 // If the tail is not folded, we know that End - Start >= Step (either
3117 // statically or through the minimum iteration checks). We also know that both
3118 // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3119 // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3120 // overflows and we can mark the induction increment as NUW.
3121 Value *Next = B.CreateAdd(Induction, Step, "index.next",
3122 /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
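// Illustrative example (assumed values): with Start = 0, End = 16 and
// Step = 8 (VF = 4, UF = 2), the IV takes the values 0 and 8 and the loop
// exits once Next == 16, so Next never wraps and the NUW flag is justified
// when the tail is not folded.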
3123 Induction->addIncoming(Start, L->getLoopPreheader());
3124 Induction->addIncoming(Next, Latch);
3125 // Create the compare.
3126 Value *ICmp = B.CreateICmpEQ(Next, End);
3127 B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3129 // Now we have two terminators. Remove the old one from the block.
3130 Latch->getTerminator()->eraseFromParent();
3132 return Induction;
3135 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3136 if (TripCount)
3137 return TripCount;
3139 assert(L && "Create Trip Count for null loop.");
3140 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3141 // Find the loop boundaries.
3142 ScalarEvolution *SE = PSE.getSE();
3143 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3144 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3145 "Invalid loop count");
3147 Type *IdxTy = Legal->getWidestInductionType();
3148 assert(IdxTy && "No type for induction");
3150 // The exit count might have type i64 while the phi is i32. This can happen
3151 // if we have an induction variable that is sign-extended before the compare.
3152 // The only way we get a backedge-taken count in that situation is if the
3153 // induction variable was signed and therefore will not overflow, in which
3154 // case truncation is legal.
3155 if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3156 IdxTy->getPrimitiveSizeInBits())
3157 BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3158 BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3160 // Get the total trip count from the count by adding 1.
3161 const SCEV *ExitCount = SE->getAddExpr(
3162 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3164 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3166 // Expand the trip count and place the new instructions in the preheader.
3167 // Notice that the pre-header does not change, only the loop body.
3168 SCEVExpander Exp(*SE, DL, "induction");
3170 // Count holds the overall loop count (N).
3171 TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3172 L->getLoopPreheader()->getTerminator());
3174 if (TripCount->getType()->isPointerTy())
3175 TripCount =
3176 CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3177 L->getLoopPreheader()->getTerminator());
3179 return TripCount;
3182 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3183 if (VectorTripCount)
3184 return VectorTripCount;
3186 Value *TC = getOrCreateTripCount(L);
3187 IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3189 Type *Ty = TC->getType();
3190 // This is where we can make the step a runtime constant.
3191 Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3193 // If the tail is to be folded by masking, round the number of iterations N
3194 // up to a multiple of Step instead of rounding down. This is done by first
3195 // adding Step-1 and then rounding down. Note that it's ok if this addition
3196 // overflows: the vector induction variable will eventually wrap to zero given
3197 // that it starts at zero and its Step is a power of two; the loop will then
3198 // exit, with the last early-exit vector comparison also producing all-true.
3199 if (Cost->foldTailByMasking()) {
3200 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3201 "VF*UF must be a power of 2 when folding tail by masking");
3202 assert(!VF.isScalable() &&
3203 "Tail folding not yet supported for scalable vectors");
3204 TC = Builder.CreateAdd(
3205 TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3208 // Now we need to generate the expression for the part of the loop that the
3209 // vectorized body will execute. This is equal to N - (N % Step) if scalar
3210 // iterations are not required for correctness, or N - Step, otherwise. Step
3211 // is equal to the vectorization factor (number of SIMD elements) times the
3212 // unroll factor (number of SIMD instructions).
3213 Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3215 // There are cases where we *must* run at least one iteration in the remainder
3216 // loop. See the cost model for when this can happen. If the step evenly
3217 // divides the trip count, we set the remainder to be equal to the step. If
3218 // the step does not evenly divide the trip count, no adjustment is necessary
3219 // since there will already be scalar iterations. Note that the minimum
3220 // iterations check ensures that N >= Step.
3221 if (Cost->requiresScalarEpilogue(VF)) {
3222 auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3223 R = Builder.CreateSelect(IsZero, Step, R);
3226 VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
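// Illustrative examples (assumed values), with VF = 4 and UF = 2 so Step = 8:
// for N = 21, R = 21 % 8 = 5 and n.vec = 16; if a scalar epilogue is required
// and N = 24, R is bumped from 0 to 8 so n.vec = 16 and the scalar loop still
// runs. With tail folding, VF = 4, UF = 1 and N = 10: TC is rounded up to 13,
// R = 1 and n.vec = 12, covering all 10 iterations with a masked final vector
// iteration.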
3228 return VectorTripCount;
3231 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3232 const DataLayout &DL) {
3233 // Verify that V is a vector type with the same number of elements as DstVTy.
3234 auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3235 unsigned VF = DstFVTy->getNumElements();
3236 auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3237 assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3238 Type *SrcElemTy = SrcVecTy->getElementType();
3239 Type *DstElemTy = DstFVTy->getElementType();
3240 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3241 "Vector elements must have same size");
3243 // Do a direct cast if element types are castable.
3244 if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3245 return Builder.CreateBitOrPointerCast(V, DstFVTy);
3247 // V cannot be cast directly to the desired vector type.
3248 // May happen when V is a floating point vector but DstVTy is a vector of
3249 // pointers or vice-versa. Handle this using a two-step bitcast using an
3250 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
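// Illustrative example (assuming 64-bit pointers): casting <2 x double> to
// <2 x i8*> goes via <2 x i64>, i.e. a bitcast to the integer vector followed
// by an inttoptr to the destination type.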
3251 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3252 "Only one type should be a pointer type");
3253 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3254 "Only one type should be a floating point type");
3255 Type *IntTy =
3256 IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3257 auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3258 Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3259 return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3262 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3263 BasicBlock *Bypass) {
3264 Value *Count = getOrCreateTripCount(L);
3265 // Reuse existing vector loop preheader for TC checks.
3266 // Note that a new preheader block is generated for the vector loop.
3267 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3268 IRBuilder<> Builder(TCCheckBlock->getTerminator());
3270 // Generate code to check if the loop's trip count is less than VF * UF, or
3271 // equal to it in case a scalar epilogue is required; this implies that the
3272 // vector trip count is zero. This check also covers the case where adding one
3273 // to the backedge-taken count overflowed leading to an incorrect trip count
3274 // of zero. In this case we will also jump to the scalar loop.
3275 auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3276 : ICmpInst::ICMP_ULT;
3278 // If tail is to be folded, vector loop takes care of all iterations.
3279 Value *CheckMinIters = Builder.getFalse();
3280 if (!Cost->foldTailByMasking()) {
3281 Value *Step =
3282 createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3283 CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
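// Illustrative example (assumed values): with VF = 4 and UF = 2, Step is 8;
// if no scalar epilogue is required we bypass the vector loop when Count < 8,
// and when an epilogue is required we bypass when Count <= 8 so that at least
// one scalar iteration remains.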
3285 // Create new preheader for vector loop.
3286 LoopVectorPreHeader =
3287 SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3288 "vector.ph");
3290 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3291 DT->getNode(Bypass)->getIDom()) &&
3292 "TC check is expected to dominate Bypass");
3294 // Update dominator for Bypass & LoopExit (if needed).
3295 DT->changeImmediateDominator(Bypass, TCCheckBlock);
3296 if (!Cost->requiresScalarEpilogue(VF))
3297 // If there is an epilogue which must run, there's no edge from the
3298 // middle block to exit blocks and thus no need to update the immediate
3299 // dominator of the exit blocks.
3300 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3302 ReplaceInstWithInst(
3303 TCCheckBlock->getTerminator(),
3304 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3305 LoopBypassBlocks.push_back(TCCheckBlock);
3308 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3310 BasicBlock *const SCEVCheckBlock =
3311 RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3312 if (!SCEVCheckBlock)
3313 return nullptr;
3315 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3316 (OptForSizeBasedOnProfile &&
3317 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3318 "Cannot SCEV check stride or overflow when optimizing for size");
3321 // Update the dominator only if this is the first RT check.
3322 if (LoopBypassBlocks.empty()) {
3323 DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3324 if (!Cost->requiresScalarEpilogue(VF))
3325 // If there is an epilogue which must run, there's no edge from the
3326 // middle block to exit blocks and thus no need to update the immediate
3327 // dominator of the exit blocks.
3328 DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3331 LoopBypassBlocks.push_back(SCEVCheckBlock);
3332 AddedSafetyChecks = true;
3333 return SCEVCheckBlock;
3336 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3337 BasicBlock *Bypass) {
3338 // VPlan-native path does not do any analysis for runtime checks currently.
3339 if (EnableVPlanNativePath)
3340 return nullptr;
3342 BasicBlock *const MemCheckBlock =
3343 RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3345 // Check if we generated code that checks at runtime whether arrays overlap.
3346 // We put the checks into a separate block to make the more common case of
3347 // few elements faster.
3348 if (!MemCheckBlock)
3349 return nullptr;
3351 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3352 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3353 "Cannot emit memory checks when optimizing for size, unless forced "
3354 "to vectorize.");
3355 ORE->emit([&]() {
3356 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3357 L->getStartLoc(), L->getHeader())
3358 << "Code-size may be reduced by not forcing "
3359 "vectorization, or by source-code modifications "
3360 "eliminating the need for runtime checks "
3361 "(e.g., adding 'restrict').";
3365 LoopBypassBlocks.push_back(MemCheckBlock);
3367 AddedSafetyChecks = true;
3369 // We currently don't use LoopVersioning for the actual loop cloning but we
3370 // still use it to add the noalias metadata.
3371 LVer = std::make_unique<LoopVersioning>(
3372 *Legal->getLAI(),
3373 Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3374 DT, PSE.getSE());
3375 LVer->prepareNoAliasMetadata();
3376 return MemCheckBlock;
3379 Value *InnerLoopVectorizer::emitTransformedIndex(
3380 IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3381 const InductionDescriptor &ID) const {
3383 SCEVExpander Exp(*SE, DL, "induction");
3384 auto Step = ID.getStep();
3385 auto StartValue = ID.getStartValue();
3386 assert(Index->getType()->getScalarType() == Step->getType() &&
3387 "Index scalar type does not match StepValue type");
3389 // Note: the IR at this point is broken. We cannot use SE to create any new
3390 // SCEV and then expand it, hoping that SCEV's simplification will give us
3391 // more optimal code. Unfortunately, attempting to do so on invalid IR may
3392 // lead to various SCEV crashes. So all we can do is use the builder and rely
3393 // on InstCombine for future simplifications. Here we handle only some
3394 // trivial cases.
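// Illustrative example (assumed values): for an integer induction with
// StartValue 0 and constant Step 1, the CreateMul helper below folds
// Index * 1 to Index and the CreateAdd helper folds 0 + Index to Index, so
// the transformed index is Index itself and no new instructions are emitted.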
3395 auto CreateAdd = [&B](Value *X, Value *Y) {
3396 assert(X->getType() == Y->getType() && "Types don't match!");
3397 if (auto *CX = dyn_cast<ConstantInt>(X))
3398 if (CX->isZero())
3399 return Y;
3400 if (auto *CY = dyn_cast<ConstantInt>(Y))
3401 if (CY->isZero())
3402 return X;
3403 return B.CreateAdd(X, Y);
3406 // We allow X to be a vector type, in which case Y will potentially be
3407 // splatted into a vector with the same element count.
3408 auto CreateMul = [&B](Value *X, Value *Y) {
3409 assert(X->getType()->getScalarType() == Y->getType() &&
3410 "Types don't match!");
3411 if (auto *CX = dyn_cast<ConstantInt>(X))
3412 if (CX->isOne())
3413 return Y;
3414 if (auto *CY = dyn_cast<ConstantInt>(Y))
3415 if (CY->isOne())
3416 return X;
3417 VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3418 if (XVTy && !isa<VectorType>(Y->getType()))
3419 Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3420 return B.CreateMul(X, Y);
3423 // Get a suitable insert point for SCEV expansion. For blocks in the vector
3424 // loop, choose the end of the vector loop header (=LoopVectorBody), because
3425 // the DomTree is not kept up-to-date for additional blocks generated in the
3426 // vector loop. By using the header as insertion point, we guarantee that the
3427 // expanded instructions dominate all their uses.
3428 auto GetInsertPoint = [this, &B]() {
3429 BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3430 if (InsertBB != LoopVectorBody &&
3431 LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3432 return LoopVectorBody->getTerminator();
3433 return &*B.GetInsertPoint();
3436 switch (ID.getKind()) {
3437 case InductionDescriptor::IK_IntInduction: {
3438 assert(!isa<VectorType>(Index->getType()) &&
3439 "Vector indices not supported for integer inductions yet");
3440 assert(Index->getType() == StartValue->getType() &&
3441 "Index type does not match StartValue type");
3442 if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3443 return B.CreateSub(StartValue, Index);
3444 auto *Offset = CreateMul(
3445 Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3446 return CreateAdd(StartValue, Offset);
3448 case InductionDescriptor::IK_PtrInduction: {
3449 assert(isa<SCEVConstant>(Step) &&
3450 "Expected constant step for pointer induction");
3451 return B.CreateGEP(
3452 StartValue->getType()->getPointerElementType(), StartValue,
3453 CreateMul(Index,
3454 Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3455 GetInsertPoint())));
3457 case InductionDescriptor::IK_FpInduction: {
3458 assert(!isa<VectorType>(Index->getType()) &&
3459 "Vector indices not supported for FP inductions yet");
3460 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3461 auto InductionBinOp = ID.getInductionBinOp();
3462 assert(InductionBinOp &&
3463 (InductionBinOp->getOpcode() == Instruction::FAdd ||
3464 InductionBinOp->getOpcode() == Instruction::FSub) &&
3465 "Original bin op should be defined for FP induction");
3467 Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3468 Value *MulExp = B.CreateFMul(StepValue, Index);
3469 return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3470 "induction");
3472 case InductionDescriptor::IK_NoInduction:
3473 return nullptr;
3475 llvm_unreachable("invalid enum");
3478 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3479 LoopScalarBody = OrigLoop->getHeader();
3480 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3481 assert(LoopVectorPreHeader && "Invalid loop structure");
3482 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3483 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3484 "multiple exit loop without required epilogue?");
3486 LoopMiddleBlock =
3487 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3488 LI, nullptr, Twine(Prefix) + "middle.block");
3489 LoopScalarPreHeader =
3490 SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3491 nullptr, Twine(Prefix) + "scalar.ph");
3493 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3495 // Set up the middle block terminator. Two cases:
3496 // 1) If we know that we must execute the scalar epilogue, emit an
3497 // unconditional branch.
3498 // 2) Otherwise, we must have a single unique exit block (due to how we
3499 // implement the multiple exit case). In this case, set up a conditional
3500 // branch from the middle block to the loop scalar preheader, and the
3501 // exit block. completeLoopSkeleton will update the condition to use an
3502 // iteration check, if required to decide whether to execute the remainder.
3503 BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3504 BranchInst::Create(LoopScalarPreHeader) :
3505 BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3506 Builder.getTrue());
3507 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3508 ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3510 // We intentionally don't let SplitBlock update LoopInfo since
3511 // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3512 // LoopVectorBody is explicitly added to the correct place a few lines later.
3513 LoopVectorBody =
3514 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3515 nullptr, nullptr, Twine(Prefix) + "vector.body");
3517 // Update dominator for loop exit.
3518 if (!Cost->requiresScalarEpilogue(VF))
3519 // If there is an epilogue which must run, there's no edge from the
3520 // middle block to exit blocks and thus no need to update the immediate
3521 // dominator of the exit blocks.
3522 DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3524 // Create and register the new vector loop.
3525 Loop *Lp = LI->AllocateLoop();
3526 Loop *ParentLoop = OrigLoop->getParentLoop();
3528 // Insert the new loop into the loop nest and register the new basic blocks
3529 // before calling any utilities such as SCEV that require valid LoopInfo.
3530 if (ParentLoop) {
3531 ParentLoop->addChildLoop(Lp);
3532 } else {
3533 LI->addTopLevelLoop(Lp);
3535 Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3536 return Lp;
3539 void InnerLoopVectorizer::createInductionResumeValues(
3540 Loop *L, Value *VectorTripCount,
3541 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3542 assert(VectorTripCount && L && "Expected valid arguments");
3543 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3544 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3545 "Inconsistent information about additional bypass.");
3546 // We are going to resume the execution of the scalar loop.
3547 // Go over all of the induction variables that we found and fix the
3548 // PHIs that are left in the scalar version of the loop.
3549 // The starting values of PHI nodes depend on the counter of the last
3550 // iteration in the vectorized loop.
3551 // If we come from a bypass edge then we need to start from the original
3552 // start value.
3553 for (auto &InductionEntry : Legal->getInductionVars()) {
3554 PHINode *OrigPhi = InductionEntry.first;
3555 InductionDescriptor II = InductionEntry.second;
3557 // Create phi nodes to merge from the backedge-taken check block.
3558 PHINode *BCResumeVal =
3559 PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3560 LoopScalarPreHeader->getTerminator());
3561 // Copy original phi DL over to the new one.
3562 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3563 Value *&EndValue = IVEndValues[OrigPhi];
3564 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3565 if (OrigPhi == OldInduction) {
3566 // We know what the end value is.
3567 EndValue = VectorTripCount;
3568 } else {
3569 IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3571 // Fast-math-flags propagate from the original induction instruction.
3572 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3573 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3575 Type *StepType = II.getStep()->getType();
3576 Instruction::CastOps CastOp =
3577 CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3578 Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3579 const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3580 EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3581 EndValue->setName("ind.end");
3583 // Compute the end value for the additional bypass (if applicable).
3584 if (AdditionalBypass.first) {
3585 B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3586 CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3587 StepType, true);
3588 CRD =
3589 B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3590 EndValueFromAdditionalBypass =
3591 emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3592 EndValueFromAdditionalBypass->setName("ind.end");
3595 // The new PHI merges the original incoming value, in case of a bypass,
3596 // or the value at the end of the vectorized loop.
3597 BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3599 // Fix the scalar body counter (PHI node).
3600 // The old induction's phi node in the scalar body needs the truncated
3601 // value.
3602 for (BasicBlock *BB : LoopBypassBlocks)
3603 BCResumeVal->addIncoming(II.getStartValue(), BB);
3605 if (AdditionalBypass.first)
3606 BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3607 EndValueFromAdditionalBypass);
3609 OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3613 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3614 MDNode *OrigLoopID) {
3615 assert(L && "Expected valid loop.");
3617 // The trip counts should be cached by now.
3618 Value *Count = getOrCreateTripCount(L);
3619 Value *VectorTripCount = getOrCreateVectorTripCount(L);
3621 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3623 // Add a check in the middle block to see if we have completed
3624 // all of the iterations in the first vector loop. Three cases:
3625 // 1) If we require a scalar epilogue, there is no conditional branch as
3626 // we unconditionally branch to the scalar preheader. Do nothing.
3627 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3628 // Thus if tail is to be folded, we know we don't need to run the
3629 // remainder and we can use the previous value for the condition (true).
3630 // 3) Otherwise, construct a runtime check.
3631 if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3632 Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3633 Count, VectorTripCount, "cmp.n",
3634 LoopMiddleBlock->getTerminator());
3636 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3637 // of the corresponding compare because they may have ended up with
3638 // different line numbers and we want to avoid awkward line stepping while
3639 // debugging, e.g. if the compare got a line number inside the loop.
3640 CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3641 cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3644 // Get ready to start creating new instructions into the vectorized body.
3645 assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3646 "Inconsistent vector loop preheader");
3647 Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3649 Optional<MDNode *> VectorizedLoopID =
3650 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3651 LLVMLoopVectorizeFollowupVectorized});
3652 if (VectorizedLoopID.hasValue()) {
3653 L->setLoopID(VectorizedLoopID.getValue());
3655 // Do not setAlreadyVectorized if loop attributes have been defined
3656 // explicitly.
3657 return LoopVectorPreHeader;
3660 // Keep all loop hints from the original loop on the vector loop (we'll
3661 // replace the vectorizer-specific hints below).
3662 if (MDNode *LID = OrigLoop->getLoopID())
3663 L->setLoopID(LID);
3665 LoopVectorizeHints Hints(L, true, *ORE);
3666 Hints.setAlreadyVectorized();
3668 #ifdef EXPENSIVE_CHECKS
3669 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3670 LI->verify(*DT);
3671 #endif
3673 return LoopVectorPreHeader;
3676 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3677 /*
3678 In this function we generate a new loop. The new loop will contain
3679 the vectorized instructions while the old loop will continue to run the
3680 scalar remainder.
3682 [ ] <-- loop iteration number check.
3685 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3686 | / |
3687 | / v
3688 || [ ] <-- vector pre header.
3689 |/ |
3691 | [ ] \
3692 | [ ]_| <-- vector loop.
3695 \ -[ ] <--- middle-block.
3696 \/ |
3697 /\ v
3698 | ->[ ] <--- new preheader.
3700 (opt) v <-- edge from middle to exit iff epilogue is not required.
3701 | [ ] \
3702 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3705 >[ ] <-- exit block(s).
3708 */
3709 // Get the metadata of the original loop before it gets modified.
3710 MDNode *OrigLoopID = OrigLoop->getLoopID();
3712 // Workaround! Compute the trip count of the original loop and cache it
3713 // before we start modifying the CFG. This code has a systemic problem
3714 // wherein it tries to run analysis over partially constructed IR; this is
3715 // wrong, and not simply for SCEV. The trip count of the original loop
3716 // simply happens to be prone to hitting this in practice. In theory, we
3717 // can hit the same issue for any SCEV, or ValueTracking query done during
3718 // mutation. See PR49900.
3719 getOrCreateTripCount(OrigLoop);
3721 // Create an empty vector loop, and prepare basic blocks for the runtime
3722 // checks.
3723 Loop *Lp = createVectorLoopSkeleton("");
3725 // Now, compare the new count to zero. If it is zero skip the vector loop and
3726 // jump to the scalar loop. This check also covers the case where the
3727 // backedge-taken count is uint##_max: adding one to it will overflow leading
3728 // to an incorrect trip count of zero. In this (rare) case we will also jump
3729 // to the scalar loop.
3730 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3732 // Generate the code to check any assumptions that we've made for SCEV
3733 // expressions.
3734 emitSCEVChecks(Lp, LoopScalarPreHeader);
3736 // Generate the code that checks in runtime if arrays overlap. We put the
3737 // checks into a separate block to make the more common case of few elements
3738 // faster.
3739 emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3741 // Some loops have a single integer induction variable, while other loops
3742 // don't. One example is C++ iterators, which often have multiple pointer
3743 // induction variables. The code below also supports the case where we
3744 // don't have a single induction variable.
3746 // We try hard to obtain an induction variable from the original loop.
3747 // However, if we don't find one that:
3748 // - is an integer
3749 // - counts from zero, stepping by one
3750 // - is the size of the widest induction variable type
3751 // then we create a new one.
3752 OldInduction = Legal->getPrimaryInduction();
3753 Type *IdxTy = Legal->getWidestInductionType();
3754 Value *StartIdx = ConstantInt::get(IdxTy, 0);
3755 // The loop step is equal to the vectorization factor (num of SIMD elements)
3756 // times the unroll factor (num of SIMD instructions).
3757 Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3758 Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3759 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3760 Induction =
3761 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3762 getDebugLocFromInstOrOperands(OldInduction));
3764 // Emit phis for the new starting index of the scalar loop.
3765 createInductionResumeValues(Lp, CountRoundDown);
3767 return completeLoopSkeleton(Lp, OrigLoopID);
3770 // Fix up external users of the induction variable. At this point, we are
3771 // in LCSSA form, with all external PHIs that use the IV having one input value,
3772 // coming from the remainder loop. We need those PHIs to also have a correct
3773 // value for the IV when arriving directly from the middle block.
3774 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3775 const InductionDescriptor &II,
3776 Value *CountRoundDown, Value *EndValue,
3777 BasicBlock *MiddleBlock) {
3778 // There are two kinds of external IV usages - those that use the value
3779 // computed in the last iteration (the PHI) and those that use the penultimate
3780 // value (the value that feeds into the phi from the loop latch).
3781 // We allow both, but they, obviously, have different values.
3783 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3785 DenseMap<Value *, Value *> MissingVals;
3787 // An external user of the last iteration's value should see the value that
3788 // the remainder loop uses to initialize its own IV.
3789 Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3790 for (User *U : PostInc->users()) {
3791 Instruction *UI = cast<Instruction>(U);
3792 if (!OrigLoop->contains(UI)) {
3793 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3794 MissingVals[UI] = EndValue;
3798 // An external user of the penultimate value needs to see EndValue - Step.
3799 // The simplest way to get this is to recompute it from the constituent SCEVs,
3800 // that is Start + (Step * (CRD - 1)).
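// Illustrative example (assumed values): with Start = 0, Step = 2 and
// CountRoundDown = 8, the escape value is 0 + 2 * (8 - 1) = 14, while a user
// of the last iteration's value sees EndValue = 16.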
3801 for (User *U : OrigPhi->users()) {
3802 auto *UI = cast<Instruction>(U);
3803 if (!OrigLoop->contains(UI)) {
3804 const DataLayout &DL =
3805 OrigLoop->getHeader()->getModule()->getDataLayout();
3806 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3808 IRBuilder<> B(MiddleBlock->getTerminator());
3810 // Fast-math-flags propagate from the original induction instruction.
3811 if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3812 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3814 Value *CountMinusOne = B.CreateSub(
3815 CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3816 Value *CMO =
3817 !II.getStep()->getType()->isIntegerTy()
3818 ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3819 II.getStep()->getType())
3820 : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3821 CMO->setName("cast.cmo");
3822 Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3823 Escape->setName("ind.escape");
3824 MissingVals[UI] = Escape;
3828 for (auto &I : MissingVals) {
3829 PHINode *PHI = cast<PHINode>(I.first);
3830 // One corner case we have to handle is two IVs "chasing" each other,
3831 // that is %IV2 = phi [...], [ %IV1, %latch ]
3832 // In this case, if IV1 has an external use, we need to avoid adding both
3833 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3834 // don't already have an incoming value for the middle block.
3835 if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3836 PHI->addIncoming(I.second, MiddleBlock);
3840 namespace {
3842 struct CSEDenseMapInfo {
3843 static bool canHandle(const Instruction *I) {
3844 return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3845 isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3848 static inline Instruction *getEmptyKey() {
3849 return DenseMapInfo<Instruction *>::getEmptyKey();
3852 static inline Instruction *getTombstoneKey() {
3853 return DenseMapInfo<Instruction *>::getTombstoneKey();
3856 static unsigned getHashValue(const Instruction *I) {
3857 assert(canHandle(I) && "Unknown instruction!");
3858 return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3859 I->value_op_end()));
3862 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3863 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3864 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3865 return LHS == RHS;
3866 return LHS->isIdenticalTo(RHS);
3870 } // end anonymous namespace
3872 /// Perform CSE of induction variable instructions.
3873 static void cse(BasicBlock *BB) {
3874 // Perform simple cse.
3875 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3876 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3877 Instruction *In = &*I++;
3879 if (!CSEDenseMapInfo::canHandle(In))
3880 continue;
3882 // Check if we can replace this instruction with any of the
3883 // visited instructions.
3884 if (Instruction *V = CSEMap.lookup(In)) {
3885 In->replaceAllUsesWith(V);
3886 In->eraseFromParent();
3887 continue;
3890 CSEMap[In] = In;
3894 InstructionCost
3895 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3896 bool &NeedToScalarize) const {
3897 Function *F = CI->getCalledFunction();
3898 Type *ScalarRetTy = CI->getType();
3899 SmallVector<Type *, 4> Tys, ScalarTys;
3900 for (auto &ArgOp : CI->arg_operands())
3901 ScalarTys.push_back(ArgOp->getType());
3903 // Estimate the cost of a scalarized vector call. The source operands are
3904 // assumed to be vectors, so we need to extract individual elements from them,
3905 // execute VF scalar calls, and then gather the results into the vector return
3906 // value.
3907 InstructionCost ScalarCallCost =
3908 TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3909 if (VF.isScalar())
3910 return ScalarCallCost;
3912 // Compute corresponding vector type for return value and arguments.
3913 Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3914 for (Type *ScalarTy : ScalarTys)
3915 Tys.push_back(ToVectorTy(ScalarTy, VF));
3917 // Compute costs of unpacking argument values for the scalar calls and
3918 // packing the return values to a vector.
3919 InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3921 InstructionCost Cost =
3922 ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
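// Illustrative example (hypothetical costs): with VF = 4, a scalar call cost
// of 10 and a scalarization overhead of 12, Cost = 4 * 10 + 12 = 52; if a
// vector variant exists with cost 20 < 52, NeedToScalarize is cleared below
// and 20 is returned instead.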
3924 // If we can't emit a vector call for this function, then the currently found
3925 // cost is the cost we need to return.
3926 NeedToScalarize = true;
3927 VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3928 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3930 if (!TLI || CI->isNoBuiltin() || !VecFunc)
3931 return Cost;
3933 // If the corresponding vector cost is cheaper, return its cost.
3934 InstructionCost VectorCallCost =
3935 TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3936 if (VectorCallCost < Cost) {
3937 NeedToScalarize = false;
3938 Cost = VectorCallCost;
3940 return Cost;
3943 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3944 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3945 return Elt;
3946 return VectorType::get(Elt, VF);
3949 InstructionCost
3950 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3951 ElementCount VF) const {
3952 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3953 assert(ID && "Expected intrinsic call!");
3954 Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3955 FastMathFlags FMF;
3956 if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3957 FMF = FPMO->getFastMathFlags();
3959 SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3960 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3961 SmallVector<Type *> ParamTys;
3962 std::transform(FTy->param_begin(), FTy->param_end(),
3963 std::back_inserter(ParamTys),
3964 [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3966 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3967 dyn_cast<IntrinsicInst>(CI));
3968 return TTI.getIntrinsicInstrCost(CostAttrs,
3969 TargetTransformInfo::TCK_RecipThroughput);
3972 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3973 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3974 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3975 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3978 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3979 auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3980 auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3981 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3984 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3985 // For every instruction `I` in MinBWs, truncate the operands, create a
3986 // truncated version of `I` and reextend its result. InstCombine runs
3987 // later and will remove any ext/trunc pairs.
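// Illustrative example (assumed values): if MinBWs records that an i32 add
// only needs 8 bits and VF = 4, its operands are shrunk to <4 x i8>, the add
// is recreated on <4 x i8>, and the result is zero-extended back to
// <4 x i32>; InstCombine later removes redundant ext/trunc pairs.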
3988 SmallPtrSet<Value *, 4> Erased;
3989 for (const auto &KV : Cost->getMinimalBitwidths()) {
3990 // If the value wasn't vectorized, we must maintain the original scalar
3991 // type. The absence of the value from State indicates that it
3992 // wasn't vectorized.
3993 VPValue *Def = State.Plan->getVPValue(KV.first);
3994 if (!State.hasAnyVectorValue(Def))
3995 continue;
3996 for (unsigned Part = 0; Part < UF; ++Part) {
3997 Value *I = State.get(Def, Part);
3998 if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3999 continue;
4000 Type *OriginalTy = I->getType();
4001 Type *ScalarTruncatedTy =
4002 IntegerType::get(OriginalTy->getContext(), KV.second);
4003 auto *TruncatedTy = VectorType::get(
4004 ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
4005 if (TruncatedTy == OriginalTy)
4006 continue;
4008 IRBuilder<> B(cast<Instruction>(I));
4009 auto ShrinkOperand = [&](Value *V) -> Value * {
4010 if (auto *ZI = dyn_cast<ZExtInst>(V))
4011 if (ZI->getSrcTy() == TruncatedTy)
4012 return ZI->getOperand(0);
4013 return B.CreateZExtOrTrunc(V, TruncatedTy);
4016 // The actual instruction modification depends on the instruction type,
4017 // unfortunately.
4018 Value *NewI = nullptr;
4019 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
4020 NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
4021 ShrinkOperand(BO->getOperand(1)));
4023 // Any wrapping introduced by shrinking this operation shouldn't be
4024 // considered undefined behavior. So, we can't unconditionally copy
4025 // arithmetic wrapping flags to NewI.
4026 cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
4027 } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
4028 NewI =
4029 B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
4030 ShrinkOperand(CI->getOperand(1)));
4031 } else if (auto *SI = dyn_cast<SelectInst>(I)) {
4032 NewI = B.CreateSelect(SI->getCondition(),
4033 ShrinkOperand(SI->getTrueValue()),
4034 ShrinkOperand(SI->getFalseValue()));
4035 } else if (auto *CI = dyn_cast<CastInst>(I)) {
4036 switch (CI->getOpcode()) {
4037 default:
4038 llvm_unreachable("Unhandled cast!");
4039 case Instruction::Trunc:
4040 NewI = ShrinkOperand(CI->getOperand(0));
4041 break;
4042 case Instruction::SExt:
4043 NewI = B.CreateSExtOrTrunc(
4044 CI->getOperand(0),
4045 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4046 break;
4047 case Instruction::ZExt:
4048 NewI = B.CreateZExtOrTrunc(
4049 CI->getOperand(0),
4050 smallestIntegerVectorType(OriginalTy, TruncatedTy));
4051 break;
4053 } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4054 auto Elements0 =
4055 cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
4056 auto *O0 = B.CreateZExtOrTrunc(
4057 SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
4058 auto Elements1 =
4059 cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
4060 auto *O1 = B.CreateZExtOrTrunc(
4061 SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
4063 NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4064 } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4065 // Don't do anything with the operands, just extend the result.
4066 continue;
4067 } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4068 auto Elements =
4069 cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
4070 auto *O0 = B.CreateZExtOrTrunc(
4071 IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4072 auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4073 NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4074 } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4075 auto Elements =
4076 cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
4077 auto *O0 = B.CreateZExtOrTrunc(
4078 EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4079 NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4080 } else {
4081 // If we don't know what to do, be conservative and don't do anything.
4082 continue;
4085 // Lastly, extend the result.
4086 NewI->takeName(cast<Instruction>(I));
4087 Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4088 I->replaceAllUsesWith(Res);
4089 cast<Instruction>(I)->eraseFromParent();
4090 Erased.insert(I);
4091 State.reset(Def, Res, Part);
4095 // We'll have created a bunch of ZExts that are now dead. Clean them up.
4096 for (const auto &KV : Cost->getMinimalBitwidths()) {
4097 // If the value wasn't vectorized, we must maintain the original scalar
4098 // type. The absence of the value from State indicates that it
4099 // wasn't vectorized.
4100 VPValue *Def = State.Plan->getVPValue(KV.first);
4101 if (!State.hasAnyVectorValue(Def))
4102 continue;
4103 for (unsigned Part = 0; Part < UF; ++Part) {
4104 Value *I = State.get(Def, Part);
4105 ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4106 if (Inst && Inst->use_empty()) {
4107 Value *NewI = Inst->getOperand(0);
4108 Inst->eraseFromParent();
4109 State.reset(Def, NewI, Part);
4115 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4116 // Insert truncates and extends for any truncated instructions as hints to
4117 // InstCombine.
4118 if (VF.isVector())
4119 truncateToMinimalBitwidths(State);
4121 // Fix widened non-induction PHIs by setting up the PHI operands.
4122 if (OrigPHIsToFix.size()) {
4123 assert(EnableVPlanNativePath &&
4124 "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4125 fixNonInductionPHIs(State);
4128 // At this point every instruction in the original loop is widened to a
4129 // vector form. Now we need to fix the recurrences in the loop. These PHI
4130 // nodes are currently empty because we did not want to introduce cycles.
4131 // This is the second stage of vectorizing recurrences.
4132 fixCrossIterationPHIs(State);
4134 // Forget the original basic block.
4135 PSE.getSE()->forgetLoop(OrigLoop);
4137 // If we inserted an edge from the middle block to the unique exit block,
4138 // update uses outside the loop (phis) to account for the newly inserted
4139 // edge.
4140 if (!Cost->requiresScalarEpilogue(VF)) {
4141 // Fix-up external users of the induction variables.
4142 for (auto &Entry : Legal->getInductionVars())
4143 fixupIVUsers(Entry.first, Entry.second,
4144 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4145 IVEndValues[Entry.first], LoopMiddleBlock);
4147 fixLCSSAPHIs(State);
4150 for (Instruction *PI : PredicatedInstructions)
4151 sinkScalarOperands(&*PI);
4153 // Remove redundant induction instructions.
4154 cse(LoopVectorBody);
4156 // Set/update profile weights for the vector and remainder loops as original
4157 // loop iterations are now distributed among them. Note that original loop
4158 // represented by LoopScalarBody becomes remainder loop after vectorization.
4160 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4161 // end up with a slightly less precise result, but that should be OK since
4162 // the profile is not inherently precise anyway. Note also that a possible
4163 // bypass of the vector code caused by legality checks is ignored,
4164 // optimistically assigning all the weight to the vector loop.
4166 // For scalable vectorization we can't know at compile time how many
4167 // iterations of the loop are handled in one vector iteration, so instead
4168 // assume a pessimistic vscale of '1'.
4169 setProfileInfoAfterUnrolling(
4170 LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4171 LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4174 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4175 // In order to support recurrences we need to be able to vectorize Phi nodes.
4176 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4177 // stage #2: We now need to fix the recurrences by adding incoming edges to
4178 // the currently empty PHI nodes. At this point every instruction in the
4179 // original loop is widened to a vector form so we can use them to construct
4180 // the incoming edges.
4181 VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4182 for (VPRecipeBase &R : Header->phis()) {
4183 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4184 fixReduction(ReductionPhi, State);
4185 else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4186 fixFirstOrderRecurrence(FOR, State);
4190 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4191 VPTransformState &State) {
4192 // This is the second phase of vectorizing first-order recurrences. An
4193 // overview of the transformation is described below. Suppose we have the
4194 // following loop.
4196 // for (int i = 0; i < n; ++i)
4197 // b[i] = a[i] - a[i - 1];
4199 // There is a first-order recurrence on "a". For this loop, the shorthand
4200 // scalar IR looks like:
4202 // scalar.ph:
4203 // s_init = a[-1]
4204 // br scalar.body
4206 // scalar.body:
4207 // i = phi [0, scalar.ph], [i+1, scalar.body]
4208 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4209 // s2 = a[i]
4210 // b[i] = s2 - s1
4211 // br cond, scalar.body, ...
4213 // In this example, s1 is a recurrence because its value depends on the
4214 // previous iteration. In the first phase of vectorization, we created a
4215 // vector phi v1 for s1. We now complete the vectorization and produce the
4216 // shorthand vector IR shown below (for VF = 4, UF = 1).
4218 // vector.ph:
4219 // v_init = vector(..., ..., ..., a[-1])
4220 // br vector.body
4222 // vector.body
4223 // i = phi [0, vector.ph], [i+4, vector.body]
4224 // v1 = phi [v_init, vector.ph], [v2, vector.body]
4225 // v2 = a[i, i+1, i+2, i+3];
4226 // v3 = vector(v1(3), v2(0, 1, 2))
4227 // b[i, i+1, i+2, i+3] = v2 - v3
4228 // br cond, vector.body, middle.block
4230 // middle.block:
4231 // x = v2(3)
4232 // br scalar.ph
4234 // scalar.ph:
4235 // s_init = phi [x, middle.block], [a[-1], otherwise]
4236 // br scalar.body
4238 // After the vector loop finishes executing, we extract the next value of
4239 // the recurrence (x) to use as the initial value in the scalar loop.
4241 // Extract the last vector element in the middle block. This will be the
4242 // initial value for the recurrence when jumping to the scalar loop.
4243 VPValue *PreviousDef = PhiR->getBackedgeValue();
4244 Value *Incoming = State.get(PreviousDef, UF - 1);
4245 auto *ExtractForScalar = Incoming;
4246 auto *IdxTy = Builder.getInt32Ty();
4247 if (VF.isVector()) {
4248 auto *One = ConstantInt::get(IdxTy, 1);
4249 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4250 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4251 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4252 ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4253 "vector.recur.extract");
4255 // Extract the second-to-last element in the middle block if the
4256 // Phi is used outside the loop. We need to extract the phi itself
4257 // and not the last element (the phi update in the current iteration). This
4258 // will be the value when jumping to the exit block from the LoopMiddleBlock,
4259 // when the scalar loop is not run at all.
4260 Value *ExtractForPhiUsedOutsideLoop = nullptr;
4261 if (VF.isVector()) {
4262 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4263 auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4264 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4265 Incoming, Idx, "vector.recur.extract.for.phi");
4266 } else if (UF > 1)
4267 // When the loop is unrolled without vectorizing, initialize
4268 // ExtractForPhiUsedOutsideLoop with the second-to-last unrolled value of
4269 // `Incoming`. This is analogous to the vectorized case above: extracting
4270 // the second-to-last element when VF > 1.
4271 ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4273 // Fix the initial value of the original recurrence in the scalar loop.
4274 Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4275 PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4276 auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4277 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4278 for (auto *BB : predecessors(LoopScalarPreHeader)) {
4279 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4280 Start->addIncoming(Incoming, BB);
4283 Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4284 Phi->setName("scalar.recur");
4286 // Finally, fix users of the recurrence outside the loop. The users will need
4287 // either the last value of the scalar recurrence or the last value of the
4288 // vector recurrence we extracted in the middle block. Since the loop is in
4289 // LCSSA form, we just need to find all the phi nodes for the original scalar
4290 // recurrence in the exit block, and then add an edge for the middle block.
4291 // Note that LCSSA does not imply single entry when the original scalar loop
4292 // had multiple exiting edges (as we always run the last iteration in the
4293 // scalar epilogue); in that case, there is no edge from the middle block to
4294 // the exit block, and thus no phis that need to be updated.
4295 if (!Cost->requiresScalarEpilogue(VF))
4296 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4297 if (any_of(LCSSAPhi.incoming_values(),
4298 [Phi](Value *V) { return V == Phi; }))
4299 LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4302 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4303 VPTransformState &State) {
4304 PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4305 // Get its reduction variable descriptor.
4306 assert(Legal->isReductionVariable(OrigPhi) &&
4307 "Unable to find the reduction variable");
4308 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4310 RecurKind RK = RdxDesc.getRecurrenceKind();
4311 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4312 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4313 setDebugLocFromInst(ReductionStartValue);
4315 VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4316 // This is the vector-clone of the value that leaves the loop.
4317 Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4319 // Wrap flags are in general invalid after vectorization, clear them.
4320 clearReductionWrapFlags(RdxDesc, State);
4322 // Before each round, move the insertion point right between
4323 // the PHIs and the values we are going to write.
4324 // This allows us to write both PHINodes and the extractelement
4325 // instructions.
4326 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4328 setDebugLocFromInst(LoopExitInst);
4330 Type *PhiTy = OrigPhi->getType();
4331 // If tail is folded by masking, the vector value to leave the loop should be
4332 // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4333 // instead of the former. For an inloop reduction the reduction will already
4334 // be predicated, and does not need to be handled here.
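// Shorthand sketch of the assumed shape (VF = 4):
//   %rdx.next = add <4 x i32> %rdx.phi, %val
//   %rdx.sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
// Below we locate that select among the users of the vectorized exit
// instruction and make it the value that leaves the loop.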
4335 if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4336 for (unsigned Part = 0; Part < UF; ++Part) {
4337 Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4338 Value *Sel = nullptr;
4339 for (User *U : VecLoopExitInst->users()) {
4340 if (isa<SelectInst>(U)) {
4341 assert(!Sel && "Reduction exit feeding two selects");
4342 Sel = U;
4343 } else
4344 assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4346 assert(Sel && "Reduction exit feeds no select");
4347 State.reset(LoopExitInstDef, Sel, Part);
4349 // If the target can create a predicated operator for the reduction at no
4350 // extra cost in the loop (for example a predicated vadd), it can be
4351 // cheaper for the select to remain in the loop than be sunk out of it,
4352 // and so use the select value for the phi instead of the old
4353 // LoopExitValue.
4354 if (PreferPredicatedReductionSelect ||
4355 TTI->preferPredicatedReductionSelect(
4356 RdxDesc.getOpcode(), PhiTy,
4357 TargetTransformInfo::ReductionFlags())) {
4358 auto *VecRdxPhi =
4359 cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4360 VecRdxPhi->setIncomingValueForBlock(
4361 LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4366 // If the vector reduction can be performed in a smaller type, we truncate
4367 // then extend the loop exit value to enable InstCombine to evaluate the
4368 // entire expression in the smaller type.
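// Sketch, assuming an i32 reduction that fits in i8 (VF = 4, signed):
//   %t = trunc <4 x i32> %rdx to <4 x i8>
//   %e = sext <4 x i8> %t to <4 x i32>
// In-loop users of %rdx are rewired to %e, and the truncated value is what
// gets reduced in the middle block.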
4369 if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4370 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4371 Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4372 Builder.SetInsertPoint(
4373 LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4374 VectorParts RdxParts(UF);
4375 for (unsigned Part = 0; Part < UF; ++Part) {
4376 RdxParts[Part] = State.get(LoopExitInstDef, Part);
4377 Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4378 Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4379 : Builder.CreateZExt(Trunc, VecTy);
4380 for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4381 UI != RdxParts[Part]->user_end();)
4382 if (*UI != Trunc) {
4383 (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4384 RdxParts[Part] = Extnd;
4385 } else {
4386 ++UI;
4389 Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4390 for (unsigned Part = 0; Part < UF; ++Part) {
4391 RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4392 State.reset(LoopExitInstDef, RdxParts[Part], Part);
4396 // Reduce all of the unrolled parts into a single vector.
4397 Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4398 unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4400 // The middle block terminator has already been assigned a DebugLoc here (the
4401 // OrigLoop's single latch terminator). We want the whole middle block to
4402 // appear to execute on this line because: (a) it is all compiler generated,
4403 // (b) these instructions are always executed after evaluating the latch
4404 // conditional branch, and (c) other passes may add new predecessors which
4405 // terminate on this line. This is the easiest way to ensure we don't
4406 // accidentally cause an extra step back into the loop while debugging.
4407 setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4408 if (PhiR->isOrdered())
4409 ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4410 else {
4411 // Floating-point operations should have some FMF to enable the reduction.
4412 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4413 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
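// The loop below combines the UF unrolled parts pairwise; e.g. (sketch) for
// UF = 2 and an integer add reduction:
//   %bin.rdx = add <4 x i32> %part1, %part0
// Min/max recurrences use a min/max operation instead of a binary operator.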
4414 for (unsigned Part = 1; Part < UF; ++Part) {
4415 Value *RdxPart = State.get(LoopExitInstDef, Part);
4416 if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4417 ReducedPartRdx = Builder.CreateBinOp(
4418 (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4419 } else {
4420 ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4425 // Create the reduction after the loop. Note that inloop reductions create the
4426 // target reduction in the loop using a Reduction recipe.
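// E.g. (sketch) for an integer add reduction with VF = 4 this becomes a call
// to @llvm.vector.reduce.add.v4i32 on the combined vector from above.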
4427 if (VF.isVector() && !PhiR->isInLoop()) {
4428 ReducedPartRdx =
4429 createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4430 // If the reduction can be performed in a smaller type, we need to extend
4431 // the reduction to the wider type before we branch to the original loop.
4432 if (PhiTy != RdxDesc.getRecurrenceType())
4433 ReducedPartRdx = RdxDesc.isSigned()
4434 ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4435 : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4438 // Create a phi node that merges control-flow from the backedge-taken check
4439 // block and the middle block.
4440 PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4441 LoopScalarPreHeader->getTerminator());
4442 for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4443 BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4444 BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4446 // Now, we need to fix the users of the reduction variable
4447 // inside and outside of the scalar remainder loop.
4449 // We know that the loop is in LCSSA form. We need to update the PHI nodes
4450 // in the exit blocks. See comment on analogous loop in
4451 // fixFirstOrderRecurrence for a more complete explanation of the logic.
4452 if (!Cost->requiresScalarEpilogue(VF))
4453 for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4454 if (any_of(LCSSAPhi.incoming_values(),
4455 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4456 LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4458 // Fix the scalar loop reduction variable with the incoming reduction sum
4459 // from the vector body and from the backedge value.
4460 int IncomingEdgeBlockIdx =
4461 OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4462 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4463 // Pick the other block.
4464 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4465 OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4466 OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4469 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4470 VPTransformState &State) {
4471 RecurKind RK = RdxDesc.getRecurrenceKind();
4472 if (RK != RecurKind::Add && RK != RecurKind::Mul)
4473 return;
4475 Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4476 assert(LoopExitInstr && "null loop exit instruction");
4477 SmallVector<Instruction *, 8> Worklist;
4478 SmallPtrSet<Instruction *, 8> Visited;
4479 Worklist.push_back(LoopExitInstr);
4480 Visited.insert(LoopExitInstr);
4482 while (!Worklist.empty()) {
4483 Instruction *Cur = Worklist.pop_back_val();
4484 if (isa<OverflowingBinaryOperator>(Cur))
4485 for (unsigned Part = 0; Part < UF; ++Part) {
4486 Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4487 cast<Instruction>(V)->dropPoisonGeneratingFlags();
4490 for (User *U : Cur->users()) {
4491 Instruction *UI = cast<Instruction>(U);
4492 if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4493 Visited.insert(UI).second)
4494 Worklist.push_back(UI);
4499 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4500 for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4501 if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4502 // Some phis were already hand-updated by the reduction and recurrence
4503 // code above; leave them alone.
4504 continue;
4506 auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4507 // Non-instruction incoming values (e.g. constants) have only a single value.
4509 VPLane Lane = VPLane::getFirstLane();
4510 if (isa<Instruction>(IncomingValue) &&
4511 !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4512 VF))
4513 Lane = VPLane::getLastLaneForVF(VF);
4515 // Can be a loop invariant incoming value or the last scalar value to be
4516 // extracted from the vectorized loop.
4517 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4518 Value *lastIncomingValue =
4519 OrigLoop->isLoopInvariant(IncomingValue)
4520 ? IncomingValue
4521 : State.get(State.Plan->getVPValue(IncomingValue),
4522 VPIteration(UF - 1, Lane));
4523 LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4527 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4528 // The basic block and loop containing the predicated instruction.
4529 auto *PredBB = PredInst->getParent();
4530 auto *VectorLoop = LI->getLoopFor(PredBB);
4532 // Initialize a worklist with the operands of the predicated instruction.
4533 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4535 // Holds instructions that we need to analyze again. An instruction may be
4536 // reanalyzed if we don't yet know if we can sink it or not.
4537 SmallVector<Instruction *, 8> InstsToReanalyze;
4539 // Returns true if a given use occurs in the predicated block. Phi nodes use
4540 // their operands in their corresponding predecessor blocks.
4541 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4542 auto *I = cast<Instruction>(U.getUser());
4543 BasicBlock *BB = I->getParent();
4544 if (auto *Phi = dyn_cast<PHINode>(I))
4545 BB = Phi->getIncomingBlock(
4546 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4547 return BB == PredBB;
4550 // Iteratively sink the scalarized operands of the predicated instruction
4551 // into the block we created for it. When an instruction is sunk, its
4552 // operands are then added to the worklist. The algorithm ends once a full
4553 // pass through the worklist fails to sink a single instruction.
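// Sketch of the idea: if the address computation feeding a predicated store,
// say %gep = getelementptr ..., currently sits outside the predicated block
// but is only used inside it, %gep is moved there and its own operands become
// new sinking candidates.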
4554 bool Changed;
4555 do {
4556 // Add the instructions that need to be reanalyzed to the worklist, and
4557 // reset the changed indicator.
4558 Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4559 InstsToReanalyze.clear();
4560 Changed = false;
4562 while (!Worklist.empty()) {
4563 auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4565 // We can't sink an instruction if it is a phi node, is not in the loop,
4566 // or may have side effects.
4567 if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4568 I->mayHaveSideEffects())
4569 continue;
4571 // If the instruction is already in PredBB, check if we can sink its
4572 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4573 // sinking the scalar instruction I, hence it appears in PredBB; but it
4574 // may have failed to sink I's operands (recursively), which we try
4575 // (again) here.
4576 if (I->getParent() == PredBB) {
4577 Worklist.insert(I->op_begin(), I->op_end());
4578 continue;
4581 // It's legal to sink the instruction if all its uses occur in the
4582 // predicated block. Otherwise, there's nothing to do yet, and we may
4583 // need to reanalyze the instruction.
4584 if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4585 InstsToReanalyze.push_back(I);
4586 continue;
4589 // Move the instruction to the beginning of the predicated block, and add
4590 // its operands to the worklist.
4591 I->moveBefore(&*PredBB->getFirstInsertionPt());
4592 Worklist.insert(I->op_begin(), I->op_end());
4594 // The sinking may have enabled other instructions to be sunk, so we will
4595 // need to iterate.
4596 Changed = true;
4598 } while (Changed);
4601 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4602 for (PHINode *OrigPhi : OrigPHIsToFix) {
4603 VPWidenPHIRecipe *VPPhi =
4604 cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4605 PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4606 // Make sure the builder has a valid insert point.
4607 Builder.SetInsertPoint(NewPhi);
4608 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4609 VPValue *Inc = VPPhi->getIncomingValue(i);
4610 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4611 NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4616 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4617 return Cost->useOrderedReductions(RdxDesc);
4620 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4621 VPUser &Operands, unsigned UF,
4622 ElementCount VF, bool IsPtrLoopInvariant,
4623 SmallBitVector &IsIndexLoopInvariant,
4624 VPTransformState &State) {
4625 // Construct a vector GEP by widening the operands of the scalar GEP as
4626 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4627 // results in a vector of pointers when at least one operand of the GEP
4628 // is vector-typed. Thus, to keep the representation compact, we only use
4629 // vector-typed operands for loop-varying values.
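// E.g. (sketch): with an invariant base %p and a loop-varying index, a GEP
// such as
//   %gep = getelementptr i32, i32* %p, i64 %iv
// is widened to a GEP with a scalar base and a vector index, yielding a
// <VF x i32*> vector of pointers.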
4631 if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4632 // If we are vectorizing, but the GEP has only loop-invariant operands,
4633 // the GEP we build (by only using vector-typed operands for
4634 // loop-varying values) would be a scalar pointer. Thus, to ensure we
4635 // produce a vector of pointers, we need to either arbitrarily pick an
4636 // operand to broadcast, or broadcast a clone of the original GEP.
4637 // Here, we broadcast a clone of the original.
4639 // TODO: If at some point we decide to scalarize instructions having
4640 // loop-invariant operands, this special case will no longer be
4641 // required. We would add the scalarization decision to
4642 // collectLoopScalars() and teach getVectorValue() to broadcast
4643 // the lane-zero scalar value.
4644 auto *Clone = Builder.Insert(GEP->clone());
4645 for (unsigned Part = 0; Part < UF; ++Part) {
4646 Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4647 State.set(VPDef, EntryPart, Part);
4648 addMetadata(EntryPart, GEP);
4650 } else {
4651 // If the GEP has at least one loop-varying operand, we are sure to
4652 // produce a vector of pointers. But if we are only unrolling, we want
4653 // to produce a scalar GEP for each unroll part. Thus, the GEP we
4654 // produce with the code below will be scalar (if VF == 1) or vector
4655 // (otherwise). Note that for the unroll-only case, we still maintain
4656 // values in the vector mapping with initVector, as we do for other
4657 // instructions.
4658 for (unsigned Part = 0; Part < UF; ++Part) {
4659 // The pointer operand of the new GEP. If it's loop-invariant, we
4660 // won't broadcast it.
4661 auto *Ptr = IsPtrLoopInvariant
4662 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4663 : State.get(Operands.getOperand(0), Part);
4665 // Collect all the indices for the new GEP. If any index is
4666 // loop-invariant, we won't broadcast it.
4667 SmallVector<Value *, 4> Indices;
4668 for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4669 VPValue *Operand = Operands.getOperand(I);
4670 if (IsIndexLoopInvariant[I - 1])
4671 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4672 else
4673 Indices.push_back(State.get(Operand, Part));
4676 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4677 // but it should be a vector, otherwise.
4678 auto *NewGEP =
4679 GEP->isInBounds()
4680 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4681 Indices)
4682 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4683 assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4684 "NewGEP is not a pointer vector");
4685 State.set(VPDef, NewGEP, Part);
4686 addMetadata(NewGEP, GEP);
4691 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4692 VPWidenPHIRecipe *PhiR,
4693 VPTransformState &State) {
4694 PHINode *P = cast<PHINode>(PN);
4695 if (EnableVPlanNativePath) {
4696 // Currently we enter here in the VPlan-native path for non-induction
4697 // PHIs where all control flow is uniform. We simply widen these PHIs.
4698 // Create a vector phi with no operands - the vector phi operands will be
4699 // set at the end of vector code generation.
4700 Type *VecTy = (State.VF.isScalar())
4701 ? PN->getType()
4702 : VectorType::get(PN->getType(), State.VF);
4703 Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4704 State.set(PhiR, VecPhi, 0);
4705 OrigPHIsToFix.push_back(P);
4707 return;
4710 assert(PN->getParent() == OrigLoop->getHeader() &&
4711 "Non-header phis should have been handled elsewhere");
4713 // In order to support recurrences we need to be able to vectorize Phi nodes.
4714 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4715 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4716 // this value when we vectorize all of the instructions that use the PHI.
4718 assert(!Legal->isReductionVariable(P) &&
4719 "reductions should be handled elsewhere");
4721 setDebugLocFromInst(P);
4723 // This PHINode must be an induction variable.
4724 // Make sure that we know about it.
4725 assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4727 InductionDescriptor II = Legal->getInductionVars().lookup(P);
4728 const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4730 // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4731 // which can be found from the original scalar operations.
4732 switch (II.getKind()) {
4733 case InductionDescriptor::IK_NoInduction:
4734 llvm_unreachable("Unknown induction");
4735 case InductionDescriptor::IK_IntInduction:
4736 case InductionDescriptor::IK_FpInduction:
4737 llvm_unreachable("Integer/fp induction is handled elsewhere.");
4738 case InductionDescriptor::IK_PtrInduction: {
4739 // Handle the pointer induction variable case.
4740 assert(P->getType()->isPointerTy() && "Unexpected type.");
4742 if (Cost->isScalarAfterVectorization(P, State.VF)) {
4743 // This is the normalized GEP that starts counting at zero.
4744 Value *PtrInd =
4745 Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4746 // Determine the number of scalars we need to generate for each unroll
4747 // iteration. If the instruction is uniform, we only need to generate the
4748 // first lane. Otherwise, we generate all VF values.
4749 bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4750 unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4752 bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4753 Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4754 if (NeedsVectorIndex) {
4755 Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4756 UnitStepVec = Builder.CreateStepVector(VecIVTy);
4757 PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4760 for (unsigned Part = 0; Part < UF; ++Part) {
4761 Value *PartStart = createStepForVF(
4762 Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4764 if (NeedsVectorIndex) {
4765 Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4766 Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4767 Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4768 Value *SclrGep =
4769 emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4770 SclrGep->setName("next.gep");
4771 State.set(PhiR, SclrGep, Part);
4772 // We've cached the whole vector, which means we can support the
4773 // extraction of any lane.
4774 continue;
4777 for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4778 Value *Idx = Builder.CreateAdd(
4779 PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4780 Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4781 Value *SclrGep =
4782 emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4783 SclrGep->setName("next.gep");
4784 State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4787 return;
4789 assert(isa<SCEVConstant>(II.getStep()) &&
4790 "Induction step not a SCEV constant!");
4791 Type *PhiType = II.getStep()->getType();
4793 // Build a pointer phi
4794 Value *ScalarStartValue = II.getStartValue();
4795 Type *ScStValueType = ScalarStartValue->getType();
4796 PHINode *NewPointerPhi =
4797 PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4798 NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4800 // A pointer induction, advanced by a GEP in the loop latch.
4801 BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4802 Instruction *InductionLoc = LoopLatch->getTerminator();
4803 const SCEV *ScalarStep = II.getStep();
4804 SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4805 Value *ScalarStepValue =
4806 Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4807 Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4808 Value *NumUnrolledElems =
4809 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4810 Value *InductionGEP = GetElementPtrInst::Create(
4811 ScStValueType->getPointerElementType(), NewPointerPhi,
4812 Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4813 InductionLoc);
4814 NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4816 // Create UF many actual address geps that use the pointer
4817 // phi as base and a vectorized version of the step value
4818 // (<step*0, ..., step*N>) as offset.
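// Sketch for VF = 4, UF = 2 and element step S: part 0 uses offsets
// <0, 1, 2, 3> * S and part 1 uses <4, 5, 6, 7> * S, each added to the
// pointer phi with a single vector GEP.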
4819 for (unsigned Part = 0; Part < State.UF; ++Part) {
4820 Type *VecPhiType = VectorType::get(PhiType, State.VF);
4821 Value *StartOffsetScalar =
4822 Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4823 Value *StartOffset =
4824 Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4825 // Create a vector of consecutive numbers from zero to VF-1.
4826 StartOffset =
4827 Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4829 Value *GEP = Builder.CreateGEP(
4830 ScStValueType->getPointerElementType(), NewPointerPhi,
4831 Builder.CreateMul(
4832 StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4833 "vector.gep"));
4834 State.set(PhiR, GEP, Part);
4840 /// A helper function for checking whether an integer division-related
4841 /// instruction may divide by zero (in which case it must be predicated if
4842 /// executed conditionally in the scalar code).
4843 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4844 /// Non-zero divisors that are not compile-time constants will not be
4845 /// converted into multiplication, so we will still end up scalarizing
4846 /// the division, but can do so w/o predication.
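/// For illustration: in `a[i] / b[i]` the divisor is only known at run time
/// and may be zero, so the division must be predicated when executed
/// conditionally, whereas `a[i] / 7` can never trap and needs no predication.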
4847 static bool mayDivideByZero(Instruction &I) {
4848 assert((I.getOpcode() == Instruction::UDiv ||
4849 I.getOpcode() == Instruction::SDiv ||
4850 I.getOpcode() == Instruction::URem ||
4851 I.getOpcode() == Instruction::SRem) &&
4852 "Unexpected instruction");
4853 Value *Divisor = I.getOperand(1);
4854 auto *CInt = dyn_cast<ConstantInt>(Divisor);
4855 return !CInt || CInt->isZero();
4858 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4859 VPUser &User,
4860 VPTransformState &State) {
4861 switch (I.getOpcode()) {
4862 case Instruction::Call:
4863 case Instruction::Br:
4864 case Instruction::PHI:
4865 case Instruction::GetElementPtr:
4866 case Instruction::Select:
4867 llvm_unreachable("This instruction is handled by a different recipe.");
4868 case Instruction::UDiv:
4869 case Instruction::SDiv:
4870 case Instruction::SRem:
4871 case Instruction::URem:
4872 case Instruction::Add:
4873 case Instruction::FAdd:
4874 case Instruction::Sub:
4875 case Instruction::FSub:
4876 case Instruction::FNeg:
4877 case Instruction::Mul:
4878 case Instruction::FMul:
4879 case Instruction::FDiv:
4880 case Instruction::FRem:
4881 case Instruction::Shl:
4882 case Instruction::LShr:
4883 case Instruction::AShr:
4884 case Instruction::And:
4885 case Instruction::Or:
4886 case Instruction::Xor: {
4887 // Just widen unops and binops.
4888 setDebugLocFromInst(&I);
4890 for (unsigned Part = 0; Part < UF; ++Part) {
4891 SmallVector<Value *, 2> Ops;
4892 for (VPValue *VPOp : User.operands())
4893 Ops.push_back(State.get(VPOp, Part));
4895 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4897 if (auto *VecOp = dyn_cast<Instruction>(V))
4898 VecOp->copyIRFlags(&I);
4900 // Use this vector value for all users of the original instruction.
4901 State.set(Def, V, Part);
4902 addMetadata(V, &I);
4905 break;
4907 case Instruction::ICmp:
4908 case Instruction::FCmp: {
4909 // Widen compares. Generate vector compares.
4910 bool FCmp = (I.getOpcode() == Instruction::FCmp);
4911 auto *Cmp = cast<CmpInst>(&I);
4912 setDebugLocFromInst(Cmp);
4913 for (unsigned Part = 0; Part < UF; ++Part) {
4914 Value *A = State.get(User.getOperand(0), Part);
4915 Value *B = State.get(User.getOperand(1), Part);
4916 Value *C = nullptr;
4917 if (FCmp) {
4918 // Propagate fast math flags.
4919 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4920 Builder.setFastMathFlags(Cmp->getFastMathFlags());
4921 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4922 } else {
4923 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4925 State.set(Def, C, Part);
4926 addMetadata(C, &I);
4929 break;
4932 case Instruction::ZExt:
4933 case Instruction::SExt:
4934 case Instruction::FPToUI:
4935 case Instruction::FPToSI:
4936 case Instruction::FPExt:
4937 case Instruction::PtrToInt:
4938 case Instruction::IntToPtr:
4939 case Instruction::SIToFP:
4940 case Instruction::UIToFP:
4941 case Instruction::Trunc:
4942 case Instruction::FPTrunc:
4943 case Instruction::BitCast: {
4944 auto *CI = cast<CastInst>(&I);
4945 setDebugLocFromInst(CI);
4947 /// Vectorize casts.
4948 Type *DestTy =
4949 (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4951 for (unsigned Part = 0; Part < UF; ++Part) {
4952 Value *A = State.get(User.getOperand(0), Part);
4953 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4954 State.set(Def, Cast, Part);
4955 addMetadata(Cast, &I);
4957 break;
4959 default:
4960 // This instruction is not vectorized by simple widening.
4961 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4962 llvm_unreachable("Unhandled instruction!");
4963 } // end of switch.
4966 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4967 VPUser &ArgOperands,
4968 VPTransformState &State) {
4969 assert(!isa<DbgInfoIntrinsic>(I) &&
4970 "DbgInfoIntrinsic should have been dropped during VPlan construction");
4971 setDebugLocFromInst(&I);
4973 Module *M = I.getParent()->getParent()->getParent();
4974 auto *CI = cast<CallInst>(&I);
4976 SmallVector<Type *, 4> Tys;
4977 for (Value *ArgOperand : CI->arg_operands())
4978 Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4980 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4982 // The flag shows whether we use an intrinsic or an ordinary call for the
4983 // vectorized version of the instruction, i.e. whether it is beneficial to
4984 // perform an intrinsic call compared to a library call.
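// E.g. (sketch): for a call that maps to llvm.sqrt we compare the widened
// intrinsic (@llvm.sqrt.v4f32 for VF = 4) against any vector library routine
// the VFDatabase knows about, and pick whichever the cost model found cheaper.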
4985 bool NeedToScalarize = false;
4986 InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4987 InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4988 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4989 assert((UseVectorIntrinsic || !NeedToScalarize) &&
4990 "Instruction should be scalarized elsewhere.");
4991 assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4992 "Either the intrinsic cost or vector call cost must be valid");
4994 for (unsigned Part = 0; Part < UF; ++Part) {
4995 SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4996 SmallVector<Value *, 4> Args;
4997 for (auto &I : enumerate(ArgOperands.operands())) {
4998 // Some intrinsics have a scalar argument - don't replace it with a
4999 // vector.
5000 Value *Arg;
5001 if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5002 Arg = State.get(I.value(), Part);
5003 else {
5004 Arg = State.get(I.value(), VPIteration(0, 0));
5005 if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
5006 TysForDecl.push_back(Arg->getType());
5008 Args.push_back(Arg);
5011 Function *VectorF;
5012 if (UseVectorIntrinsic) {
5013 // Use vector version of the intrinsic.
5014 if (VF.isVector())
5015 TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5016 VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5017 assert(VectorF && "Can't retrieve vector intrinsic.");
5018 } else {
5019 // Use vector version of the function call.
5020 const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5021 #ifndef NDEBUG
5022 assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5023 "Can't create vector function.");
5024 #endif
5025 VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5027 SmallVector<OperandBundleDef, 1> OpBundles;
5028 CI->getOperandBundlesAsDefs(OpBundles);
5029 CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5031 if (isa<FPMathOperator>(V))
5032 V->copyFastMathFlags(CI);
5034 State.set(Def, V, Part);
5035 addMetadata(V, &I);
5039 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5040 VPUser &Operands,
5041 bool InvariantCond,
5042 VPTransformState &State) {
5043 setDebugLocFromInst(&I);
5045 // The condition can be loop invariant but still defined inside the
5046 // loop. This means that we can't just use the original 'cond' value.
5047 // We have to take the 'vectorized' value and pick the first lane.
5048 // InstCombine will make this a no-op.
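// Sketch: for a condition such as %c = icmp eq i64 %inv1, %inv2 that is
// loop-invariant but defined in the loop, we use lane 0 of its widened value
// as the scalar select condition instead of the original scalar %c.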
5049 auto *InvarCond = InvariantCond
5050 ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5051 : nullptr;
5053 for (unsigned Part = 0; Part < UF; ++Part) {
5054 Value *Cond =
5055 InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5056 Value *Op0 = State.get(Operands.getOperand(1), Part);
5057 Value *Op1 = State.get(Operands.getOperand(2), Part);
5058 Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5059 State.set(VPDef, Sel, Part);
5060 addMetadata(Sel, &I);
5064 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5065 // We should not collect Scalars more than once per VF. Right now, this
5066 // function is called from collectUniformsAndScalars(), which already does
5067 // this check. Collecting Scalars for VF=1 does not make any sense.
5068 assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5069 "This function should not be visited twice for the same VF");
5071 SmallSetVector<Instruction *, 8> Worklist;
5073 // These sets are used to seed the analysis with pointers used by memory
5074 // accesses that will remain scalar.
5075 SmallSetVector<Instruction *, 8> ScalarPtrs;
5076 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5077 auto *Latch = TheLoop->getLoopLatch();
5079 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5080 // The pointer operands of loads and stores will be scalar as long as the
5081 // memory access is not a gather or scatter operation. The value operand of a
5082 // store will remain scalar if the store is scalarized.
5083 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5084 InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5085 assert(WideningDecision != CM_Unknown &&
5086 "Widening decision should be ready at this moment");
5087 if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5088 if (Ptr == Store->getValueOperand())
5089 return WideningDecision == CM_Scalarize;
5090 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5091 "Ptr is neither a value or pointer operand");
5092 return WideningDecision != CM_GatherScatter;
5095 // A helper that returns true if the given value is a bitcast or
5096 // getelementptr instruction contained in the loop.
5097 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5098 return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5099 isa<GetElementPtrInst>(V)) &&
5100 !TheLoop->isLoopInvariant(V);
5103 auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5104 if (!isa<PHINode>(Ptr) ||
5105 !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5106 return false;
5107 auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5108 if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5109 return false;
5110 return isScalarUse(MemAccess, Ptr);
5113 // A helper that evaluates a memory access's use of a pointer. If the
5114 // pointer is actually the pointer induction of a loop, it is inserted
5115 // into the Worklist. If the use will be a scalar use, and the
5116 // pointer is only used by memory accesses, we place the pointer in
5117 // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5118 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5119 if (isScalarPtrInduction(MemAccess, Ptr)) {
5120 Worklist.insert(cast<Instruction>(Ptr));
5121 LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5122 << "\n");
5124 Instruction *Update = cast<Instruction>(
5125 cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5126 ScalarPtrs.insert(Update);
5127 return;
5129 // We only care about bitcast and getelementptr instructions contained in
5130 // the loop.
5131 if (!isLoopVaryingBitCastOrGEP(Ptr))
5132 return;
5134 // If the pointer has already been identified as scalar (e.g., if it was
5135 // also identified as uniform), there's nothing to do.
5136 auto *I = cast<Instruction>(Ptr);
5137 if (Worklist.count(I))
5138 return;
5140 // If the use of the pointer will be a scalar use, and all users of the
5141 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5142 // place the pointer in PossibleNonScalarPtrs.
5143 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5144 return isa<LoadInst>(U) || isa<StoreInst>(U);
5146 ScalarPtrs.insert(I);
5147 else
5148 PossibleNonScalarPtrs.insert(I);
5151 // We seed the scalars analysis with two classes of instructions: (1)
5152 // instructions marked uniform-after-vectorization and (2) bitcast,
5153 // getelementptr and (pointer) phi instructions used by memory accesses
5154 // requiring a scalar use.
5156 // (1) Add to the worklist all instructions that have been identified as
5157 // uniform-after-vectorization.
5158 Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5160 // (2) Add to the worklist all bitcast and getelementptr instructions used by
5161 // memory accesses requiring a scalar use. The pointer operands of loads and
5162 // stores will be scalar as long as the memory access is not a gather or
5163 // scatter operation. The value operand of a store will remain scalar if the
5164 // store is scalarized.
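// E.g. (sketch): the address of a consecutive (widened) or scalarized access
// only ever needs scalar values, whereas a gather/scatter needs a vector of
// pointers, so its address computation is not added here.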
5165 for (auto *BB : TheLoop->blocks())
5166 for (auto &I : *BB) {
5167 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5168 evaluatePtrUse(Load, Load->getPointerOperand());
5169 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5170 evaluatePtrUse(Store, Store->getPointerOperand());
5171 evaluatePtrUse(Store, Store->getValueOperand());
5174 for (auto *I : ScalarPtrs)
5175 if (!PossibleNonScalarPtrs.count(I)) {
5176 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5177 Worklist.insert(I);
5180 // Insert the forced scalars.
5181 // FIXME: Currently widenPHIInstruction() often creates a dead vector
5182 // induction variable when the PHI user is scalarized.
5183 auto ForcedScalar = ForcedScalars.find(VF);
5184 if (ForcedScalar != ForcedScalars.end())
5185 for (auto *I : ForcedScalar->second)
5186 Worklist.insert(I);
5188 // Expand the worklist by looking through any bitcasts and getelementptr
5189 // instructions we've already identified as scalar. This is similar to the
5190 // expansion step in collectLoopUniforms(); however, here we're only
5191 // expanding to include additional bitcasts and getelementptr instructions.
5192 unsigned Idx = 0;
5193 while (Idx != Worklist.size()) {
5194 Instruction *Dst = Worklist[Idx++];
5195 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5196 continue;
5197 auto *Src = cast<Instruction>(Dst->getOperand(0));
5198 if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5199 auto *J = cast<Instruction>(U);
5200 return !TheLoop->contains(J) || Worklist.count(J) ||
5201 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5202 isScalarUse(J, Src));
5203 })) {
5204 Worklist.insert(Src);
5205 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5209 // An induction variable will remain scalar if all users of the induction
5210 // variable and induction variable update remain scalar.
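// E.g. (sketch): an induction that is only used to form addresses that were
// already identified as scalar (and by its own update) can stay scalar, while
// one that also feeds a widened instruction in the loop cannot.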
5211 for (auto &Induction : Legal->getInductionVars()) {
5212 auto *Ind = Induction.first;
5213 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5215 // If tail-folding is applied, the primary induction variable will be used
5216 // to feed a vector compare.
5217 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5218 continue;
5220 // Determine if all users of the induction variable are scalar after
5221 // vectorization.
5222 auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5223 auto *I = cast<Instruction>(U);
5224 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5226 if (!ScalarInd)
5227 continue;
5229 // Determine if all users of the induction variable update instruction are
5230 // scalar after vectorization.
5231 auto ScalarIndUpdate =
5232 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5233 auto *I = cast<Instruction>(U);
5234 return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5236 if (!ScalarIndUpdate)
5237 continue;
5239 // The induction variable and its update instruction will remain scalar.
5240 Worklist.insert(Ind);
5241 Worklist.insert(IndUpdate);
5242 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5243 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5244 << "\n");
5247 Scalars[VF].insert(Worklist.begin(), Worklist.end());
5250 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5251 if (!blockNeedsPredication(I->getParent()))
5252 return false;
5253 switch(I->getOpcode()) {
5254 default:
5255 break;
5256 case Instruction::Load:
5257 case Instruction::Store: {
5258 if (!Legal->isMaskRequired(I))
5259 return false;
5260 auto *Ptr = getLoadStorePointerOperand(I);
5261 auto *Ty = getLoadStoreType(I);
5262 const Align Alignment = getLoadStoreAlignment(I);
5263 return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5264 TTI.isLegalMaskedGather(Ty, Alignment))
5265 : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5266 TTI.isLegalMaskedScatter(Ty, Alignment));
5268 case Instruction::UDiv:
5269 case Instruction::SDiv:
5270 case Instruction::SRem:
5271 case Instruction::URem:
5272 return mayDivideByZero(*I);
5274 return false;
5277 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5278 Instruction *I, ElementCount VF) {
5279 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5280 assert(getWideningDecision(I, VF) == CM_Unknown &&
5281 "Decision should not be set yet.");
5282 auto *Group = getInterleavedAccessGroup(I);
5283 assert(Group && "Must have a group.");
5285 // If the instruction's allocated size doesn't equal its type size, it
5286 // requires padding and will be scalarized.
5287 auto &DL = I->getModule()->getDataLayout();
5288 auto *ScalarTy = getLoadStoreType(I);
5289 if (hasIrregularType(ScalarTy, DL))
5290 return false;
5292 // Check if masking is required.
5293 // A Group may need masking for one of two reasons: it resides in a block that
5294 // needs predication, or it was decided to use masking to deal with gaps
5295 // (either a gap at the end of a load-access that may result in a speculative
5296 // load, or any gaps in a store-access).
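// E.g. (sketch): a stride-2 load group that only uses a[2*i] but not a[2*i+1]
// has a gap at the end, so the widened load could read past the last element
// and needs either masking or a scalar epilogue.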
5297 bool PredicatedAccessRequiresMasking =
5298 Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5299 bool LoadAccessWithGapsRequiresEpilogMasking =
5300 isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
5301 !isScalarEpilogueAllowed();
5302 bool StoreAccessWithGapsRequiresMasking =
5303 isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
5304 if (!PredicatedAccessRequiresMasking &&
5305 !LoadAccessWithGapsRequiresEpilogMasking &&
5306 !StoreAccessWithGapsRequiresMasking)
5307 return true;
5309 // If masked interleaving is required, we expect that the user/target had
5310 // enabled it, because otherwise it either wouldn't have been created or
5311 // it should have been invalidated by the CostModel.
5312 assert(useMaskedInterleavedAccesses(TTI) &&
5313 "Masked interleave-groups for predicated accesses are not enabled.");
5315 auto *Ty = getLoadStoreType(I);
5316 const Align Alignment = getLoadStoreAlignment(I);
5317 return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5318 : TTI.isLegalMaskedStore(Ty, Alignment);
5321 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5322 Instruction *I, ElementCount VF) {
5323 // Get and ensure we have a valid memory instruction.
5324 LoadInst *LI = dyn_cast<LoadInst>(I);
5325 StoreInst *SI = dyn_cast<StoreInst>(I);
5326 assert((LI || SI) && "Invalid memory instruction");
5328 auto *Ptr = getLoadStorePointerOperand(I);
5330 // In order to be widened, the pointer should be consecutive, first of all.
5331 if (!Legal->isConsecutivePtr(Ptr))
5332 return false;
5334 // If the instruction is a store located in a predicated block, it will be
5335 // scalarized.
5336 if (isScalarWithPredication(I))
5337 return false;
5339 // If the instruction's allocated size doesn't equal its type size, it
5340 // requires padding and will be scalarized.
5341 auto &DL = I->getModule()->getDataLayout();
5342 auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5343 if (hasIrregularType(ScalarTy, DL))
5344 return false;
5346 return true;
5349 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5350 // We should not collect Uniforms more than once per VF. Right now,
5351 // this function is called from collectUniformsAndScalars(), which
5352 // already does this check. Collecting Uniforms for VF=1 does not make any
5353 // sense.
5355 assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5356 "This function should not be visited twice for the same VF");
5358 // Visit the list of Uniforms. If we do not find any uniform value, we will
5359 // not analyze it again. Uniforms.count(VF) will return 1.
5360 Uniforms[VF].clear();
5362 // We now know that the loop is vectorizable!
5363 // Collect instructions inside the loop that will remain uniform after
5364 // vectorization.
5366 // Global values, params and instructions outside of the current loop are out
5367 // of scope.
5368 auto isOutOfScope = [&](Value *V) -> bool {
5369 Instruction *I = dyn_cast<Instruction>(V);
5370 return (!I || !TheLoop->contains(I));
5373 SetVector<Instruction *> Worklist;
5374 BasicBlock *Latch = TheLoop->getLoopLatch();
5376 // Instructions that are scalar with predication must not be considered
5377 // uniform after vectorization, because that would create an erroneous
5378 // replicating region where only a single instance out of VF should be formed.
5379 // TODO: optimize such seldom cases if found important, see PR40816.
5380 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5381 if (isOutOfScope(I)) {
5382 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5383 << *I << "\n");
5384 return;
5386 if (isScalarWithPredication(I)) {
5387 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5388 << *I << "\n");
5389 return;
5391 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5392 Worklist.insert(I);
5395 // Start with the conditional branch. If the branch condition is an
5396 // instruction contained in the loop that is only used by the branch, it is
5397 // uniform.
5398 auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5399 if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5400 addToWorklistIfAllowed(Cmp);
5402 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5403 InstWidening WideningDecision = getWideningDecision(I, VF);
5404 assert(WideningDecision != CM_Unknown &&
5405 "Widening decision should be ready at this moment");
5407 // A uniform memory op is itself uniform. We exclude uniform stores
5408 // here as they demand the last lane, not the first one.
5409 if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5410 assert(WideningDecision == CM_Scalarize);
5411 return true;
5414 return (WideningDecision == CM_Widen ||
5415 WideningDecision == CM_Widen_Reverse ||
5416 WideningDecision == CM_Interleave);
5420 // Returns true if Ptr is the pointer operand of a memory access instruction
5421 // I, and I is known to not require scalarization.
5422 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5423 return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5426 // Holds a list of values which are known to have at least one uniform use.
5427 // Note that there may be other uses which aren't uniform. A "uniform use"
5428 // here is something which only demands lane 0 of the unrolled iterations;
5429 // it does not imply that all lanes produce the same value (e.g. this is not
5430 // the usual meaning of uniform).
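// Illustrative example with hypothetical IR (names are made up): for a
// consecutive load
//   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
//   %val = load i32, i32* %gep
// the widened load only needs the lane-0 value of %gep to form its vector
// pointer, so %gep has a "uniform use" here even though its per-lane values
// differ.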
5431 SetVector<Value *> HasUniformUse;
5433 // Scan the loop for instructions which are either a) known to have only
5434 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5435 for (auto *BB : TheLoop->blocks())
5436 for (auto &I : *BB) {
5437 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5438 switch (II->getIntrinsicID()) {
5439 case Intrinsic::sideeffect:
5440 case Intrinsic::experimental_noalias_scope_decl:
5441 case Intrinsic::assume:
5442 case Intrinsic::lifetime_start:
5443 case Intrinsic::lifetime_end:
5444 if (TheLoop->hasLoopInvariantOperands(&I))
5445 addToWorklistIfAllowed(&I);
5446 break;
5447 default:
5448 break;
5452 // ExtractValue instructions must be uniform, because the operands are
5453 // known to be loop-invariant.
5454 if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5455 assert(isOutOfScope(EVI->getAggregateOperand()) &&
5456 "Expected aggregate value to be loop invariant");
5457 addToWorklistIfAllowed(EVI);
5458 continue;
5461 // If there's no pointer operand, there's nothing to do.
5462 auto *Ptr = getLoadStorePointerOperand(&I);
5463 if (!Ptr)
5464 continue;
5466 // A uniform memory op is itself uniform. We exclude uniform stores
5467 // here as they demand the last lane, not the first one.
5468 if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5469 addToWorklistIfAllowed(&I);
5471 if (isUniformDecision(&I, VF)) {
5472 assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5473 HasUniformUse.insert(Ptr);
5477 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5478 // demanding) users. Since loops are assumed to be in LCSSA form, this
5479 // disallows uses outside the loop as well.
5480 for (auto *V : HasUniformUse) {
5481 if (isOutOfScope(V))
5482 continue;
5483 auto *I = cast<Instruction>(V);
5484 auto UsersAreMemAccesses =
5485 llvm::all_of(I->users(), [&](User *U) -> bool {
5486 return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5488 if (UsersAreMemAccesses)
5489 addToWorklistIfAllowed(I);
5492 // Expand Worklist in topological order: whenever a new instruction
5493 // is added, its users should already be inside the Worklist. This ensures
5494 // that a uniform instruction will only be used by uniform instructions.
5495 unsigned idx = 0;
5496 while (idx != Worklist.size()) {
5497 Instruction *I = Worklist[idx++];
5499 for (auto OV : I->operand_values()) {
5500 // isOutOfScope operands cannot be uniform instructions.
5501 if (isOutOfScope(OV))
5502 continue;
5503 // First-order recurrence PHIs should typically be considered
5504 // non-uniform.
5505 auto *OP = dyn_cast<PHINode>(OV);
5506 if (OP && Legal->isFirstOrderRecurrence(OP))
5507 continue;
5508 // If all the users of the operand are uniform, then add the
5509 // operand into the uniform worklist.
5510 auto *OI = cast<Instruction>(OV);
5511 if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5512 auto *J = cast<Instruction>(U);
5513 return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5515 addToWorklistIfAllowed(OI);
5519 // For an instruction to be added into Worklist above, all its users inside
5520 // the loop should also be in Worklist. However, this condition cannot be
5521 // true for phi nodes that form a cyclic dependence. We must process phi
5522 // nodes separately. An induction variable will remain uniform if all users
5523 // of the induction variable and induction variable update remain uniform.
5524 // The code below handles both pointer and non-pointer induction variables.
5525 for (auto &Induction : Legal->getInductionVars()) {
5526 auto *Ind = Induction.first;
5527 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5529 // Determine if all users of the induction variable are uniform after
5530 // vectorization.
5531 auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5532 auto *I = cast<Instruction>(U);
5533 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5534 isVectorizedMemAccessUse(I, Ind);
5536 if (!UniformInd)
5537 continue;
5539 // Determine if all users of the induction variable update instruction are
5540 // uniform after vectorization.
5541 auto UniformIndUpdate =
5542 llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5543 auto *I = cast<Instruction>(U);
5544 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5545 isVectorizedMemAccessUse(I, IndUpdate);
5547 if (!UniformIndUpdate)
5548 continue;
5550 // The induction variable and its update instruction will remain uniform.
5551 addToWorklistIfAllowed(Ind);
5552 addToWorklistIfAllowed(IndUpdate);
5555 Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5558 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5559 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5561 if (Legal->getRuntimePointerChecking()->Need) {
5562 reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5563 "runtime pointer checks needed. Enable vectorization of this "
5564 "loop with '#pragma clang loop vectorize(enable)' when "
5565 "compiling with -Os/-Oz",
5566 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5567 return true;
5570 if (!PSE.getUnionPredicate().getPredicates().empty()) {
5571 reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5572 "runtime SCEV checks needed. Enable vectorization of this "
5573 "loop with '#pragma clang loop vectorize(enable)' when "
5574 "compiling with -Os/-Oz",
5575 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5576 return true;
5579 // FIXME: Avoid specializing for stride==1 instead of bailing out.
5580 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5581 reportVectorizationFailure("Runtime stride check for small trip count",
5582 "runtime stride == 1 checks needed. Enable vectorization of "
5583 "this loop without such check by compiling with -Os/-Oz",
5584 "CantVersionLoopWithOptForSize", ORE, TheLoop);
5585 return true;
5588 return false;
5591 ElementCount
5592 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5593 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5594 return ElementCount::getScalable(0);
5596 if (Hints->isScalableVectorizationDisabled()) {
5597 reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5598 "ScalableVectorizationDisabled", ORE, TheLoop);
5599 return ElementCount::getScalable(0);
5602 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5604 auto MaxScalableVF = ElementCount::getScalable(
5605 std::numeric_limits<ElementCount::ScalarTy>::max());
5607 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5608 // FIXME: While for scalable vectors this is currently sufficient, this should
5609 // be replaced by a more detailed mechanism that filters out specific VFs,
5610 // instead of invalidating vectorization for a whole set of VFs based on the
5611 // MaxVF.
5613 // Disable scalable vectorization if the loop contains unsupported reductions.
5614 if (!canVectorizeReductions(MaxScalableVF)) {
5615 reportVectorizationInfo(
5616 "Scalable vectorization not supported for the reduction "
5617 "operations found in this loop.",
5618 "ScalableVFUnfeasible", ORE, TheLoop);
5619 return ElementCount::getScalable(0);
5622 // Disable scalable vectorization if the loop contains any instructions
5623 // with element types not supported for scalable vectors.
5624 if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5625 return !Ty->isVoidTy() &&
5626 !this->TTI.isElementTypeLegalForScalableVector(Ty);
5627 })) {
5628 reportVectorizationInfo("Scalable vectorization is not supported "
5629 "for all element types found in this loop.",
5630 "ScalableVFUnfeasible", ORE, TheLoop);
5631 return ElementCount::getScalable(0);
5634 if (Legal->isSafeForAnyVectorWidth())
5635 return MaxScalableVF;
5637 // Limit MaxScalableVF by the maximum safe dependence distance.
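// For example (hypothetical values): with MaxSafeElements == 32 and a target
// maximum vscale of 16, the clamped result below is
// ElementCount::getScalable(32 / 16), i.e. VF = vscale x 2.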
5638 Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5639 if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5640 unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
5641 .getVScaleRangeArgs()
5642 .second;
5643 if (VScaleMax > 0)
5644 MaxVScale = VScaleMax;
5646 MaxScalableVF = ElementCount::getScalable(
5647 MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5648 if (!MaxScalableVF)
5649 reportVectorizationInfo(
5650 "Max legal vector width too small, scalable vectorization "
5651 "unfeasible.",
5652 "ScalableVFUnfeasible", ORE, TheLoop);
5654 return MaxScalableVF;
5657 FixedScalableVFPair
5658 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5659 ElementCount UserVF) {
5660 MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5661 unsigned SmallestType, WidestType;
5662 std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5664 // Get the maximum safe dependence distance in bits computed by LAA.
5665 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5666 // the memory access that is most restrictive (involved in the smallest
5667 // dependence distance).
5668 unsigned MaxSafeElements =
5669 PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
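// For example (hypothetical numbers): a max safe vector width of 1024 bits
// with a widest type of 32 bits gives
// MaxSafeElements = PowerOf2Floor(1024 / 32) = 32.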
5671 auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5672 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5674 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5675 << ".\n");
5676 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5677 << ".\n");
5679 // First analyze the UserVF, fall back if the UserVF should be ignored.
5680 if (UserVF) {
5681 auto MaxSafeUserVF =
5682 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5684 if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5685 // If `VF=vscale x N` is safe, then so is `VF=N`
5686 if (UserVF.isScalable())
5687 return FixedScalableVFPair(
5688 ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5689 else
5690 return UserVF;
5693 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5695 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5696 // is better to ignore the hint and let the compiler choose a suitable VF.
5697 if (!UserVF.isScalable()) {
5698 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5699 << " is unsafe, clamping to max safe VF="
5700 << MaxSafeFixedVF << ".\n");
5701 ORE->emit([&]() {
5702 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5703 TheLoop->getStartLoc(),
5704 TheLoop->getHeader())
5705 << "User-specified vectorization factor "
5706 << ore::NV("UserVectorizationFactor", UserVF)
5707 << " is unsafe, clamping to maximum safe vectorization factor "
5708 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5710 return MaxSafeFixedVF;
5713 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5714 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5715 << " is ignored because scalable vectors are not "
5716 "available.\n");
5717 ORE->emit([&]() {
5718 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5719 TheLoop->getStartLoc(),
5720 TheLoop->getHeader())
5721 << "User-specified vectorization factor "
5722 << ore::NV("UserVectorizationFactor", UserVF)
5723 << " is ignored because the target does not support scalable "
5724 "vectors. The compiler will pick a more suitable value.";
5726 } else {
5727 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5728 << " is unsafe. Ignoring scalable UserVF.\n");
5729 ORE->emit([&]() {
5730 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5731 TheLoop->getStartLoc(),
5732 TheLoop->getHeader())
5733 << "User-specified vectorization factor "
5734 << ore::NV("UserVectorizationFactor", UserVF)
5735 << " is unsafe. Ignoring the hint to let the compiler pick a "
5736 "more suitable value.";
5741 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5742 << " / " << WidestType << " bits.\n");
5744 FixedScalableVFPair Result(ElementCount::getFixed(1),
5745 ElementCount::getScalable(0));
5746 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5747 WidestType, MaxSafeFixedVF))
5748 Result.FixedVF = MaxVF;
5750 if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5751 WidestType, MaxSafeScalableVF))
5752 if (MaxVF.isScalable()) {
5753 Result.ScalableVF = MaxVF;
5754 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5755 << "\n");
5758 return Result;
5761 FixedScalableVFPair
5762 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5763 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5764 // TODO: It may be useful to do this, since the check is still likely to be
5765 // dynamically uniform if the target can skip it.
5766 reportVectorizationFailure(
5767 "Not inserting runtime ptr check for divergent target",
5768 "runtime pointer checks needed. Not enabled for divergent target",
5769 "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5770 return FixedScalableVFPair::getNone();
5773 unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5774 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5775 if (TC == 1) {
5776 reportVectorizationFailure("Single iteration (non) loop",
5777 "loop trip count is one, irrelevant for vectorization",
5778 "SingleIterationLoop", ORE, TheLoop);
5779 return FixedScalableVFPair::getNone();
5782 switch (ScalarEpilogueStatus) {
5783 case CM_ScalarEpilogueAllowed:
5784 return computeFeasibleMaxVF(TC, UserVF);
5785 case CM_ScalarEpilogueNotAllowedUsePredicate:
5786 LLVM_FALLTHROUGH;
5787 case CM_ScalarEpilogueNotNeededUsePredicate:
5788 LLVM_DEBUG(
5789 dbgs() << "LV: vector predicate hint/switch found.\n"
5790 << "LV: Not allowing scalar epilogue, creating predicated "
5791 << "vector loop.\n");
5792 break;
5793 case CM_ScalarEpilogueNotAllowedLowTripLoop:
5794 // fallthrough as a special case of OptForSize
5795 case CM_ScalarEpilogueNotAllowedOptSize:
5796 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5797 LLVM_DEBUG(
5798 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5799 else
5800 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5801 << "count.\n");
5803 // Bail if runtime checks are required, which are not good when optimising
5804 // for size.
5805 if (runtimeChecksRequired())
5806 return FixedScalableVFPair::getNone();
5808 break;
5811 // The only loops we can vectorize without a scalar epilogue, are loops with
5812 // a bottom-test and a single exiting block. We'd have to handle the fact
5813 // that not every instruction executes on the last iteration. This will
5814 // require a lane mask which varies through the vector loop body. (TODO)
5815 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5816 // If there was a tail-folding hint/switch, but we can't fold the tail by
5817 // masking, fallback to a vectorization with a scalar epilogue.
5818 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5819 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5820 "scalar epilogue instead.\n");
5821 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5822 return computeFeasibleMaxVF(TC, UserVF);
5824 return FixedScalableVFPair::getNone();
5827 // Now try the tail folding
5829 // Invalidate interleave groups that require an epilogue if we can't mask
5830 // the interleave-group.
5831 if (!useMaskedInterleavedAccesses(TTI)) {
5832 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5833 "No decisions should have been taken at this point");
5834 // Note: There is no need to invalidate any cost modeling decisions here, as
5835 // none were taken so far.
5836 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5839 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5840 // Avoid tail folding if the trip count is known to be a multiple of any VF
5841 // we chose.
5842 // FIXME: The condition below pessimises the case for fixed-width vectors,
5843 // when scalable VFs are also candidates for vectorization.
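// Worked example (hypothetical values): with a known trip count of 128,
// MaxFixedVF = 8 and UserIC = 2, MaxVFtimesIC = 16 and 128 urem 16 == 0,
// so no tail remains and tail folding is unnecessary.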
5844 if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5845 ElementCount MaxFixedVF = MaxFactors.FixedVF;
5846 assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5847 "MaxFixedVF must be a power of 2");
5848 unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5849 : MaxFixedVF.getFixedValue();
5850 ScalarEvolution *SE = PSE.getSE();
5851 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5852 const SCEV *ExitCount = SE->getAddExpr(
5853 BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5854 const SCEV *Rem = SE->getURemExpr(
5855 SE->applyLoopGuards(ExitCount, TheLoop),
5856 SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5857 if (Rem->isZero()) {
5858 // Accept MaxFixedVF if we do not have a tail.
5859 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5860 return MaxFactors;
5864 // For scalable vectors, don't use tail folding as this is currently not yet
5865 // supported. The code is likely to have ended up here if the tripcount is
5866 // low, in which case it makes sense not to use scalable vectors.
5867 if (MaxFactors.ScalableVF.isVector())
5868 MaxFactors.ScalableVF = ElementCount::getScalable(0);
5870 // If we don't know the precise trip count, or if the trip count that we
5871 // found modulo the vectorization factor is not zero, try to fold the tail
5872 // by masking.
5873 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5874 if (Legal->prepareToFoldTailByMasking()) {
5875 FoldTailByMasking = true;
5876 return MaxFactors;
5879 // If there was a tail-folding hint/switch, but we can't fold the tail by
5880 // masking, fallback to a vectorization with a scalar epilogue.
5881 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5882 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5883 "scalar epilogue instead.\n");
5884 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5885 return MaxFactors;
5888 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5889 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5890 return FixedScalableVFPair::getNone();
5893 if (TC == 0) {
5894 reportVectorizationFailure(
5895 "Unable to calculate the loop count due to complex control flow",
5896 "unable to calculate the loop count due to complex control flow",
5897 "UnknownLoopCountComplexCFG", ORE, TheLoop);
5898 return FixedScalableVFPair::getNone();
5901 reportVectorizationFailure(
5902 "Cannot optimize for size and vectorize at the same time.",
5903 "cannot optimize for size and vectorize at the same time. "
5904 "Enable vectorization of this loop with '#pragma clang loop "
5905 "vectorize(enable)' when compiling with -Os/-Oz",
5906 "NoTailLoopWithOptForSize", ORE, TheLoop);
5907 return FixedScalableVFPair::getNone();
5910 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5911 unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5912 const ElementCount &MaxSafeVF) {
5913 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5914 TypeSize WidestRegister = TTI.getRegisterBitWidth(
5915 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5916 : TargetTransformInfo::RGK_FixedWidthVector);
5918 // Convenience function to return the minimum of two ElementCounts.
5919 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5920 assert((LHS.isScalable() == RHS.isScalable()) &&
5921 "Scalable flags must match");
5922 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5925 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5926 // Note that both WidestRegister and WidestType may not be powers of 2.
5927 auto MaxVectorElementCount = ElementCount::get(
5928 PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5929 ComputeScalableMaxVF);
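// For example (hypothetical numbers): a 256-bit widest register and a widest
// type of 32 bits give MaxVectorElementCount = PowerOf2Floor(256 / 32) = 8
// lanes before clamping against MaxSafeVF below.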
5930 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5931 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5932 << (MaxVectorElementCount * WidestType) << " bits.\n");
5934 if (!MaxVectorElementCount) {
5935 LLVM_DEBUG(dbgs() << "LV: The target has no "
5936 << (ComputeScalableMaxVF ? "scalable" : "fixed")
5937 << " vector registers.\n");
5938 return ElementCount::getFixed(1);
5941 const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5942 if (ConstTripCount &&
5943 ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5944 isPowerOf2_32(ConstTripCount)) {
5945 // We need to clamp the VF to be the ConstTripCount. There is no point in
5946 // choosing a higher viable VF as done in the loop below. If
5947 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5948 // the TC is less than or equal to the known number of lanes.
5949 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5950 << ConstTripCount << "\n");
5951 return TripCountEC;
5954 ElementCount MaxVF = MaxVectorElementCount;
5955 if (TTI.shouldMaximizeVectorBandwidth() ||
5956 (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5957 auto MaxVectorElementCountMaxBW = ElementCount::get(
5958 PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5959 ComputeScalableMaxVF);
5960 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5962 // Collect all viable vectorization factors larger than the default MaxVF
5963 // (i.e. MaxVectorElementCount).
5964 SmallVector<ElementCount, 8> VFs;
5965 for (ElementCount VS = MaxVectorElementCount * 2;
5966 ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5967 VFs.push_back(VS);
5969 // For each VF calculate its register usage.
5970 auto RUs = calculateRegisterUsage(VFs);
5972 // Select the largest VF which doesn't require more registers than existing
5973 // ones.
5974 for (int i = RUs.size() - 1; i >= 0; --i) {
5975 bool Selected = true;
5976 for (auto &pair : RUs[i].MaxLocalUsers) {
5977 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5978 if (pair.second > TargetNumRegisters)
5979 Selected = false;
5981 if (Selected) {
5982 MaxVF = VFs[i];
5983 break;
5986 if (ElementCount MinVF =
5987 TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5988 if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5989 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5990 << ") with target's minimum: " << MinVF << '\n');
5991 MaxVF = MinVF;
5995 return MaxVF;
5998 bool LoopVectorizationCostModel::isMoreProfitable(
5999 const VectorizationFactor &A, const VectorizationFactor &B) const {
6000 InstructionCost CostA = A.Cost;
6001 InstructionCost CostB = B.Cost;
6003 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
6005 if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
6006 MaxTripCount) {
6007 // If we are folding the tail and the trip count is a known (possibly small)
6008 // constant, the trip count will be rounded up to an integer number of
6009 // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
6010 // which we compare directly. When not folding the tail, the total cost will
6011 // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
6012 // approximated with the per-lane cost below instead of using the tripcount
6013 // as here.
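// Illustrative comparison (hypothetical costs): with MaxTripCount = 10,
// A = {VF=2, Cost=10} and B = {VF=8, Cost=36}, RTCostA = 10 * ceil(10/2) = 50
// and RTCostB = 36 * ceil(10/8) = 72, so A is preferred even though B has the
// lower per-lane cost.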
6014 auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
6015 auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
6016 return RTCostA < RTCostB;
6019 // When set to preferred, for now assume vscale may be larger than 1, so
6020 // that scalable vectorization is slightly favorable over fixed-width
6021 // vectorization.
6022 if (Hints->isScalableVectorizationPreferred())
6023 if (A.Width.isScalable() && !B.Width.isScalable())
6024 return (CostA * B.Width.getKnownMinValue()) <=
6025 (CostB * A.Width.getKnownMinValue());
6027 // To avoid the need for FP division:
6028 // (CostA / A.Width) < (CostB / B.Width)
6029 // <=> (CostA * B.Width) < (CostB * A.Width)
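// For example (hypothetical costs): A = {VF=4, Cost=20} vs. B = {VF=8,
// Cost=36}: 20 * 8 = 160 is not less than 36 * 4 = 144, so A is not
// considered more profitable than B (per-lane cost 5 vs. 4.5).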
6030 return (CostA * B.Width.getKnownMinValue()) <
6031 (CostB * A.Width.getKnownMinValue());
6034 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
6035 const ElementCountSet &VFCandidates) {
6036 InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
6037 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
6038 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
6039 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
6040 "Expected Scalar VF to be a candidate");
6042 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
6043 VectorizationFactor ChosenFactor = ScalarCost;
6045 bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6046 if (ForceVectorization && VFCandidates.size() > 1) {
6047 // Ignore scalar width, because the user explicitly wants vectorization.
6048 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
6049 // evaluation.
6050 ChosenFactor.Cost = InstructionCost::getMax();
6053 SmallVector<InstructionVFPair> InvalidCosts;
6054 for (const auto &i : VFCandidates) {
6055 // The cost for scalar VF=1 is already calculated, so ignore it.
6056 if (i.isScalar())
6057 continue;
6059 VectorizationCostTy C = expectedCost(i, &InvalidCosts);
6060 VectorizationFactor Candidate(i, C.first);
6061 LLVM_DEBUG(
6062 dbgs() << "LV: Vector loop of width " << i << " costs: "
6063 << (Candidate.Cost / Candidate.Width.getKnownMinValue())
6064 << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
6065 << ".\n");
6067 if (!C.second && !ForceVectorization) {
6068 LLVM_DEBUG(
6069 dbgs() << "LV: Not considering vector loop of width " << i
6070 << " because it will not generate any vector instructions.\n");
6071 continue;
6074 // If profitable, add it to the ProfitableVFs list.
6075 if (isMoreProfitable(Candidate, ScalarCost))
6076 ProfitableVFs.push_back(Candidate);
6078 if (isMoreProfitable(Candidate, ChosenFactor))
6079 ChosenFactor = Candidate;
6082 // Emit a report of VFs with invalid costs in the loop.
6083 if (!InvalidCosts.empty()) {
6084 // Group the remarks per instruction, keeping the instruction order from
6085 // InvalidCosts.
6086 std::map<Instruction *, unsigned> Numbering;
6087 unsigned I = 0;
6088 for (auto &Pair : InvalidCosts)
6089 if (!Numbering.count(Pair.first))
6090 Numbering[Pair.first] = I++;
6092 // Sort the list, first on instruction(number) then on VF.
6093 llvm::sort(InvalidCosts,
6094 [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
6095 if (Numbering[A.first] != Numbering[B.first])
6096 return Numbering[A.first] < Numbering[B.first];
6097 ElementCountComparator ECC;
6098 return ECC(A.second, B.second);
6101 // For a list of ordered instruction-vf pairs:
6102 // [(load, vf1), (load, vf2), (store, vf1)]
6103 // Group the instructions together to emit separate remarks for:
6104 // load (vf1, vf2)
6105 // store (vf1)
6106 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
6107 auto Subset = ArrayRef<InstructionVFPair>();
6108 do {
6109 if (Subset.empty())
6110 Subset = Tail.take_front(1);
6112 Instruction *I = Subset.front().first;
6114 // If the next instruction is different, or if there are no other pairs,
6115 // emit a remark for the collated subset. e.g.
6116 // [(load, vf1), (load, vf2)]
6117 // to emit:
6118 // remark: invalid costs for 'load' at VF=(vf1, vf2)
6119 if (Subset == Tail || Tail[Subset.size()].first != I) {
6120 std::string OutString;
6121 raw_string_ostream OS(OutString);
6122 assert(!Subset.empty() && "Unexpected empty range");
6123 OS << "Instruction with invalid costs prevented vectorization at VF=(";
6124 for (auto &Pair : Subset)
6125 OS << (Pair.second == Subset.front().second ? "" : ", ")
6126 << Pair.second;
6127 OS << "):";
6128 if (auto *CI = dyn_cast<CallInst>(I))
6129 OS << " call to " << CI->getCalledFunction()->getName();
6130 else
6131 OS << " " << I->getOpcodeName();
6132 OS.flush();
6133 reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
6134 Tail = Tail.drop_front(Subset.size());
6135 Subset = {};
6136 } else
6137 // Grow the subset by one element
6138 Subset = Tail.take_front(Subset.size() + 1);
6139 } while (!Tail.empty());
6142 if (!EnableCondStoresVectorization && NumPredStores) {
6143 reportVectorizationFailure("There are conditional stores.",
6144 "store that is conditionally executed prevents vectorization",
6145 "ConditionalStore", ORE, TheLoop);
6146 ChosenFactor = ScalarCost;
6149 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
6150 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
6151 << "LV: Vectorization seems to be not beneficial, "
6152 << "but was forced by a user.\n");
6153 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
6154 return ChosenFactor;
6157 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
6158 const Loop &L, ElementCount VF) const {
6159 // Cross iteration phis such as reductions need special handling and are
6160 // currently unsupported.
6161 if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
6162 return Legal->isFirstOrderRecurrence(&Phi) ||
6163 Legal->isReductionVariable(&Phi);
6165 return false;
6167 // Phis with uses outside of the loop require special handling and are
6168 // currently unsupported.
6169 for (auto &Entry : Legal->getInductionVars()) {
6170 // Look for uses of the value of the induction at the last iteration.
6171 Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
6172 for (User *U : PostInc->users())
6173 if (!L.contains(cast<Instruction>(U)))
6174 return false;
6175 // Look for uses of penultimate value of the induction.
6176 for (User *U : Entry.first->users())
6177 if (!L.contains(cast<Instruction>(U)))
6178 return false;
6181 // Induction variables that are widened require special handling that is
6182 // currently not supported.
6183 if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6184 return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6185 this->isProfitableToScalarize(Entry.first, VF));
6187 return false;
6189 // Epilogue vectorization code has not been audited to ensure it handles
6190 // non-latch exits properly. It may be fine, but it needs to be audited and
6191 // tested.
6192 if (L.getExitingBlock() != L.getLoopLatch())
6193 return false;
6195 return true;
6198 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6199 const ElementCount VF) const {
6200 // FIXME: We need a much better cost-model to take different parameters such
6201 // as register pressure, code size increase and cost of extra branches into
6202 // account. For now we apply a very crude heuristic and only consider loops
6203 // with vectorization factors larger than a certain value.
6204 // We also consider epilogue vectorization unprofitable for targets that don't
6205 // consider interleaving beneficial (e.g. MVE).
6206 if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6207 return false;
6208 if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6209 return true;
6210 return false;
6213 VectorizationFactor
6214 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6215 const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6216 VectorizationFactor Result = VectorizationFactor::Disabled();
6217 if (!EnableEpilogueVectorization) {
6218 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6219 return Result;
6222 if (!isScalarEpilogueAllowed()) {
6223 LLVM_DEBUG(
6224 dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6225 "allowed.\n";);
6226 return Result;
6229 // FIXME: This can be fixed for scalable vectors later, because at this stage
6230 // the LoopVectorizer will only consider vectorizing a loop with scalable
6231 // vectors when the loop has a hint to enable vectorization for a given VF.
6232 if (MainLoopVF.isScalable()) {
6233 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
6234 "yet supported.\n");
6235 return Result;
6238 // Not really a cost consideration, but check for unsupported cases here to
6239 // simplify the logic.
6240 if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
6241 LLVM_DEBUG(
6242 dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
6243 "not a supported candidate.\n";);
6244 return Result;
6247 if (EpilogueVectorizationForceVF > 1) {
6248 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
6249 if (LVP.hasPlanWithVFs(
6250 {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
6251 return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
6252 else {
6253 LLVM_DEBUG(
6254 dbgs()
6255 << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6256 return Result;
6260 if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6261 TheLoop->getHeader()->getParent()->hasMinSize()) {
6262 LLVM_DEBUG(
6263 dbgs()
6264 << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6265 return Result;
6268 if (!isEpilogueVectorizationProfitable(MainLoopVF))
6269 return Result;
6271 for (auto &NextVF : ProfitableVFs)
6272 if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6273 (Result.Width.getFixedValue() == 1 ||
6274 isMoreProfitable(NextVF, Result)) &&
6275 LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6276 Result = NextVF;
6278 if (Result != VectorizationFactor::Disabled())
6279 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6280 << Result.Width.getFixedValue() << "\n";);
6281 return Result;
6284 std::pair<unsigned, unsigned>
6285 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6286 unsigned MinWidth = -1U;
6287 unsigned MaxWidth = 8;
6288 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6289 for (Type *T : ElementTypesInLoop) {
6290 MinWidth = std::min<unsigned>(
6291 MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6292 MaxWidth = std::max<unsigned>(
6293 MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6295 return {MinWidth, MaxWidth};
6298 void LoopVectorizationCostModel::collectElementTypesForWidening() {
6299 ElementTypesInLoop.clear();
6300 // For each block.
6301 for (BasicBlock *BB : TheLoop->blocks()) {
6302 // For each instruction in the loop.
6303 for (Instruction &I : BB->instructionsWithoutDebug()) {
6304 Type *T = I.getType();
6306 // Skip ignored values.
6307 if (ValuesToIgnore.count(&I))
6308 continue;
6310 // Only examine Loads, Stores and PHINodes.
6311 if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6312 continue;
6314 // Examine PHI nodes that are reduction variables. Update the type to
6315 // account for the recurrence type.
6316 if (auto *PN = dyn_cast<PHINode>(&I)) {
6317 if (!Legal->isReductionVariable(PN))
6318 continue;
6319 const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
6320 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6321 TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6322 RdxDesc.getRecurrenceType(),
6323 TargetTransformInfo::ReductionFlags()))
6324 continue;
6325 T = RdxDesc.getRecurrenceType();
6328 // Examine the stored values.
6329 if (auto *ST = dyn_cast<StoreInst>(&I))
6330 T = ST->getValueOperand()->getType();
6332 // Ignore loaded pointer types and stored pointer types that are not
6333 // vectorizable.
6335 // FIXME: The check here attempts to predict whether a load or store will
6336 // be vectorized. We only know this for certain after a VF has
6337 // been selected. Here, we assume that if an access can be
6338 // vectorized, it will be. We should also look at extending this
6339 // optimization to non-pointer types.
6341 if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6342 !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6343 continue;
6345 ElementTypesInLoop.insert(T);
6350 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6351 unsigned LoopCost) {
6352 // -- The interleave heuristics --
6353 // We interleave the loop in order to expose ILP and reduce the loop overhead.
6354 // There are many micro-architectural considerations that we can't predict
6355 // at this level. For example, frontend pressure (on decode or fetch) due to
6356 // code size, or the number and capabilities of the execution ports.
6358 // We use the following heuristics to select the interleave count:
6359 // 1. If the code has reductions, then we interleave to break the cross
6360 // iteration dependency.
6361 // 2. If the loop is really small, then we interleave to reduce the loop
6362 // overhead.
6363 // 3. We don't interleave if we think that we will spill registers to memory
6364 // due to the increased register pressure.
6366 if (!isScalarEpilogueAllowed())
6367 return 1;
6369 // The max safe dependence distance already limited the VF; don't interleave further.
6370 if (Legal->getMaxSafeDepDistBytes() != -1U)
6371 return 1;
6373 auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6374 const bool HasReductions = !Legal->getReductionVars().empty();
6375 // Do not interleave loops with a relatively small known or estimated trip
6376 // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6377 // enabled, and the code has scalar reductions (HasReductions && VF == 1),
6378 // because with the above conditions interleaving can expose ILP and break
6379 // cross-iteration dependences for reductions.
6380 if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6381 !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6382 return 1;
6384 RegisterUsage R = calculateRegisterUsage({VF})[0];
6385 // We divide by these constants so assume that we have at least one
6386 // instruction that uses at least one register.
6387 for (auto& pair : R.MaxLocalUsers) {
6388 pair.second = std::max(pair.second, 1U);
6391 // We calculate the interleave count using the following formula.
6392 // Subtract the number of loop invariants from the number of available
6393 // registers. These registers are used by all of the interleaved instances.
6394 // Next, divide the remaining registers by the number of registers that is
6395 // required by the loop, in order to estimate how many parallel instances
6396 // fit without causing spills. All of this is rounded down if necessary to be
6397 // a power of two. We want power of two interleave count to simplify any
6398 // addressing operations or alignment considerations.
6399 // We also want power of two interleave counts to ensure that the induction
6400 // variable of the vector loop wraps to zero, when tail is folded by masking;
6401 // this currently happens when OptForSize, in which case IC is set to 1 above.
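// Worked example (hypothetical numbers): with 32 registers in a class, 2 of
// them occupied by loop-invariant values and at most 10 values live at once,
// the per-class estimate is PowerOf2Floor((32 - 2) / 10) = 2 interleaved
// copies.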
6402 unsigned IC = UINT_MAX;
6404 for (auto& pair : R.MaxLocalUsers) {
6405 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6406 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6407 << " registers of "
6408 << TTI.getRegisterClassName(pair.first) << " register class\n");
6409 if (VF.isScalar()) {
6410 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6411 TargetNumRegisters = ForceTargetNumScalarRegs;
6412 } else {
6413 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6414 TargetNumRegisters = ForceTargetNumVectorRegs;
6416 unsigned MaxLocalUsers = pair.second;
6417 unsigned LoopInvariantRegs = 0;
6418 if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6419 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6421 unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6422 // Don't count the induction variable as interleaved.
6423 if (EnableIndVarRegisterHeur) {
6424 TmpIC =
6425 PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6426 std::max(1U, (MaxLocalUsers - 1)));
6429 IC = std::min(IC, TmpIC);
6432 // Clamp the interleave ranges to reasonable counts.
6433 unsigned MaxInterleaveCount =
6434 TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6436 // Check if the user has overridden the max.
6437 if (VF.isScalar()) {
6438 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6439 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6440 } else {
6441 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6442 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6445 // If the trip count is a known or estimated compile-time constant, limit the
6446 // interleave count to be less than the trip count divided by VF, provided it
6447 // is at least 1.
6449 // For scalable vectors we can't know if interleaving is beneficial. It may
6450 // not be beneficial for small loops if none of the lanes in the second vector
6451 // iteration is enabled. However, for larger loops, there is likely to be a
6452 // similar benefit as for fixed-width vectors. For now, we choose to leave
6453 // the InterleaveCount as if vscale is '1', although if some information about
6454 // the vector is known (e.g. min vector size), we can make a better decision.
6455 if (BestKnownTC) {
6456 MaxInterleaveCount =
6457 std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6458 // Make sure MaxInterleaveCount is greater than 0.
6459 MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6462 assert(MaxInterleaveCount > 0 &&
6463 "Maximum interleave count must be greater than 0");
6465 // Clamp the calculated IC to be between 1 and the max interleave count
6466 // that the target and trip count allows.
6467 if (IC > MaxInterleaveCount)
6468 IC = MaxInterleaveCount;
6469 else
6470 // Make sure IC is greater than 0.
6471 IC = std::max(1u, IC);
6473 assert(IC > 0 && "Interleave count must be greater than 0.");
6475 // If we did not calculate the cost for VF (because the user selected the VF)
6476 // then we calculate the cost of VF here.
6477 if (LoopCost == 0) {
6478 InstructionCost C = expectedCost(VF).first;
6479 assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6480 LoopCost = *C.getValue();
6483 assert(LoopCost && "Non-zero loop cost expected");
6485 // Interleave if we vectorized this loop and there is a reduction that could
6486 // benefit from interleaving.
6487 if (VF.isVector() && HasReductions) {
6488 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6489 return IC;
6492 // Note that if we've already vectorized the loop we will have done the
6493 // runtime check and so interleaving won't require further checks.
6494 bool InterleavingRequiresRuntimePointerCheck =
6495 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6497 // We want to interleave small loops in order to reduce the loop overhead and
6498 // potentially expose ILP opportunities.
6499 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6500 << "LV: IC is " << IC << '\n'
6501 << "LV: VF is " << VF << '\n');
6502 const bool AggressivelyInterleaveReductions =
6503 TTI.enableAggressiveInterleaving(HasReductions);
6504 if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6505 // We assume that the cost overhead is 1 and we use the cost model
6506 // to estimate the cost of the loop and interleave until the cost of the
6507 // loop overhead is about 5% of the cost of the loop.
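// For example (hypothetical numbers): with SmallLoopCost = 20 and
// LoopCost = 6, SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).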
6508 unsigned SmallIC =
6509 std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6511 // Interleave until store/load ports (estimated by max interleave count) are
6512 // saturated.
6513 unsigned NumStores = Legal->getNumStores();
6514 unsigned NumLoads = Legal->getNumLoads();
6515 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6516 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6518 // If we have a scalar reduction (vector reductions are already dealt with
6519 // by this point), we can increase the critical path length if the loop
6520 // we're interleaving is inside another loop. For tree-wise reductions
6521 // set the limit to 2, and for ordered reductions it's best to disable
6522 // interleaving entirely.
6523 if (HasReductions && TheLoop->getLoopDepth() > 1) {
6524 bool HasOrderedReductions =
6525 any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6526 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6527 return RdxDesc.isOrdered();
6529 if (HasOrderedReductions) {
6530 LLVM_DEBUG(
6531 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6532 return 1;
6535 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6536 SmallIC = std::min(SmallIC, F);
6537 StoresIC = std::min(StoresIC, F);
6538 LoadsIC = std::min(LoadsIC, F);
6541 if (EnableLoadStoreRuntimeInterleave &&
6542 std::max(StoresIC, LoadsIC) > SmallIC) {
6543 LLVM_DEBUG(
6544 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6545 return std::max(StoresIC, LoadsIC);
6548 // If there are scalar reductions and TTI has enabled aggressive
6549 // interleaving for reductions, we will interleave to expose ILP.
6550 if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6551 AggressivelyInterleaveReductions) {
6552 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6553 // Interleave no less than SmallIC but not as aggressive as the normal IC
6554 // to satisfy the rare situation when resources are too limited.
6555 return std::max(IC / 2, SmallIC);
6556 } else {
6557 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6558 return SmallIC;
6562 // Interleave if this is a large loop (small loops are already dealt with by
6563 // this point) that could benefit from interleaving.
6564 if (AggressivelyInterleaveReductions) {
6565 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6566 return IC;
6569 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6570 return 1;
6573 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6574 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6575 // This function calculates the register usage by measuring the highest number
6576 // of values that are alive at a single location. Obviously, this is a very
6577 // rough estimation. We scan the loop in topological order and
6578 // assign a number to each instruction. We use RPO to ensure that defs are
6579 // met before their users. We assume that each instruction that has in-loop
6580 // users starts an interval. We record every time that an in-loop value is
6581 // used, so we have a list of the first and last occurrences of each
6582 // instruction. Next, we transpose this data structure into a multi map that
6583 // holds the list of intervals that *end* at a specific location. This multi
6584 // map allows us to perform a linear search. We scan the instructions linearly
6585 // and record each time that a new interval starts, by placing it in a set.
6586 // If we find this value in the multi-map then we remove it from the set.
6587 // The max register usage is the maximum size of the set.
6588 // We also search for instructions that are defined outside the loop, but are
6589 // used inside the loop. We need this number separately from the max-interval
6590 // usage number because when we unroll, loop-invariant values do not take
6591 // more registers.
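// Rough illustration (hypothetical): a value defined at the top of the body
// and only used by the last instruction keeps its interval open across every
// instruction in between, while a value defined and consumed back-to-back
// overlaps with almost nothing; the maximum number of simultaneously open
// intervals is what gets reported per register class.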
6592 LoopBlocksDFS DFS(TheLoop);
6593 DFS.perform(LI);
6595 RegisterUsage RU;
6597 // Each 'key' in the map opens a new interval. The values
6598 // of the map are the index of the 'last seen' usage of the
6599 // instruction that is the key.
6600 using IntervalMap = DenseMap<Instruction *, unsigned>;
6602 // Maps instruction to its index.
6603 SmallVector<Instruction *, 64> IdxToInstr;
6604 // Marks the end of each interval.
6605 IntervalMap EndPoint;
6607 // Saves the set of instructions that are used inside the loop.
6607 SmallPtrSet<Instruction *, 8> Ends;
6608 // Saves the list of values that are used in the loop but are
6609 // defined outside the loop, such as arguments and constants.
6610 SmallPtrSet<Value *, 8> LoopInvariants;
6612 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6613 for (Instruction &I : BB->instructionsWithoutDebug()) {
6614 IdxToInstr.push_back(&I);
6616 // Save the end location of each USE.
6617 for (Value *U : I.operands()) {
6618 auto *Instr = dyn_cast<Instruction>(U);
6620 // Ignore non-instruction values such as arguments, constants, etc.
6621 if (!Instr)
6622 continue;
6624 // If this instruction is outside the loop then record it and continue.
6625 if (!TheLoop->contains(Instr)) {
6626 LoopInvariants.insert(Instr);
6627 continue;
6630 // Overwrite previous end points.
6631 EndPoint[Instr] = IdxToInstr.size();
6632 Ends.insert(Instr);
6637 // Saves the list of intervals that end with the index in 'key'.
6638 using InstrList = SmallVector<Instruction *, 2>;
6639 DenseMap<unsigned, InstrList> TransposeEnds;
6641 // Transpose the EndPoints to a list of values that end at each index.
6642 for (auto &Interval : EndPoint)
6643 TransposeEnds[Interval.second].push_back(Interval.first);
6645 SmallPtrSet<Instruction *, 8> OpenIntervals;
6646 SmallVector<RegisterUsage, 8> RUs(VFs.size());
6647 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6649 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6651 // A lambda that gets the register usage for the given type and VF.
6652 const auto &TTICapture = TTI;
6653 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6654 if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6655 return 0;
6656 InstructionCost::CostType RegUsage =
6657 *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6658 assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6659 "Nonsensical values for register usage.");
6660 return RegUsage;
6663 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6664 Instruction *I = IdxToInstr[i];
6666 // Remove all of the instructions that end at this location.
6667 InstrList &List = TransposeEnds[i];
6668 for (Instruction *ToRemove : List)
6669 OpenIntervals.erase(ToRemove);
6671 // Ignore instructions that are never used within the loop.
6672 if (!Ends.count(I))
6673 continue;
6675 // Skip ignored values.
6676 if (ValuesToIgnore.count(I))
6677 continue;
6679 // For each VF find the maximum usage of registers.
6680 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6681 // Count the number of live intervals.
6682 SmallMapVector<unsigned, unsigned, 4> RegUsage;
6684 if (VFs[j].isScalar()) {
6685 for (auto Inst : OpenIntervals) {
6686 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6687 if (RegUsage.find(ClassID) == RegUsage.end())
6688 RegUsage[ClassID] = 1;
6689 else
6690 RegUsage[ClassID] += 1;
6692 } else {
6693 collectUniformsAndScalars(VFs[j]);
6694 for (auto Inst : OpenIntervals) {
6695 // Skip ignored values for VF > 1.
6696 if (VecValuesToIgnore.count(Inst))
6697 continue;
6698 if (isScalarAfterVectorization(Inst, VFs[j])) {
6699 unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6700 if (RegUsage.find(ClassID) == RegUsage.end())
6701 RegUsage[ClassID] = 1;
6702 else
6703 RegUsage[ClassID] += 1;
6704 } else {
6705 unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6706 if (RegUsage.find(ClassID) == RegUsage.end())
6707 RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6708 else
6709 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6714 for (auto& pair : RegUsage) {
6715 if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6716 MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6717 else
6718 MaxUsages[j][pair.first] = pair.second;
6722 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6723 << OpenIntervals.size() << '\n');
6725 // Add the current instruction to the list of open intervals.
6726 OpenIntervals.insert(I);
6729 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6730 SmallMapVector<unsigned, unsigned, 4> Invariant;
6732 for (auto Inst : LoopInvariants) {
6733 unsigned Usage =
6734 VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6735 unsigned ClassID =
6736 TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6737 if (Invariant.find(ClassID) == Invariant.end())
6738 Invariant[ClassID] = Usage;
6739 else
6740 Invariant[ClassID] += Usage;
6743 LLVM_DEBUG({
6744 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6745 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6746 << " item\n";
6747 for (const auto &pair : MaxUsages[i]) {
6748 dbgs() << "LV(REG): RegisterClass: "
6749 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6750 << " registers\n";
6752 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6753 << " item\n";
6754 for (const auto &pair : Invariant) {
6755 dbgs() << "LV(REG): RegisterClass: "
6756 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6757 << " registers\n";
6761 RU.LoopInvariantRegs = Invariant;
6762 RU.MaxLocalUsers = MaxUsages[i];
6763 RUs[i] = RU;
6766 return RUs;
6769 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6770 // TODO: Cost model for emulated masked load/store is completely
6771 // broken. This hack guides the cost model to use an artificially
6772 // high enough value to practically disable vectorization with such
6773 // operations, except where the previously deployed legality hack allowed
6774 // using very low cost values. This is to avoid regressions coming simply
6775 // from moving the "masked load/store" check from legality to the cost model.
6776 // Masked Load/Gather emulation was previously never allowed.
6777 // Only a limited amount of Masked Store/Scatter emulation was allowed.
6778 assert(isPredicatedInst(I) &&
6779 "Expecting a scalar emulated instruction");
6780 return isa<LoadInst>(I) ||
6781 (isa<StoreInst>(I) &&
6782 NumPredStores > NumberOfStoresToPredicate);
6785 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6786 // If we aren't vectorizing the loop, or if we've already collected the
6787 // instructions to scalarize, there's nothing to do. Collection may already
6788 // have occurred if we have a user-selected VF and are now computing the
6789 // expected cost for interleaving.
6790 if (VF.isScalar() || VF.isZero() ||
6791 InstsToScalarize.find(VF) != InstsToScalarize.end())
6792 return;
6794 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6795 // not profitable to scalarize any instructions, the presence of VF in the
6796 // map will indicate that we've analyzed it already.
6797 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6799 // Find all the instructions that are scalar with predication in the loop and
6800 // determine if it would be better to not if-convert the blocks they are in.
6801 // If so, we also record the instructions to scalarize.
6802 for (BasicBlock *BB : TheLoop->blocks()) {
6803 if (!blockNeedsPredication(BB))
6804 continue;
6805 for (Instruction &I : *BB)
6806 if (isScalarWithPredication(&I)) {
6807 ScalarCostsTy ScalarCosts;
6808         // Do not apply the discount if the VF is scalable, because that
6809         // would lead to invalid scalarization costs.
6810         // Also skip the discount logic if the hacked cost is needed
6811         // for emulated masked memrefs.
6812 if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6813 computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6814 ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6815 // Remember that BB will remain after vectorization.
6816 PredicatedBBsAfterVectorization.insert(BB);
6821 int LoopVectorizationCostModel::computePredInstDiscount(
6822 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6823 assert(!isUniformAfterVectorization(PredInst, VF) &&
6824 "Instruction marked uniform-after-vectorization will be predicated");
6826 // Initialize the discount to zero, meaning that the scalar version and the
6827 // vector version cost the same.
6828 InstructionCost Discount = 0;
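  // For illustration: if the vector form of the chain would cost 10 and the
  // scalar form (after scaling by block probability) would cost 6, the
  // accumulated discount is 4 and the caller considers the chain profitable
  // to scalarize.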
6830 // Holds instructions to analyze. The instructions we visit are mapped in
6831 // ScalarCosts. Those instructions are the ones that would be scalarized if
6832 // we find that the scalar version costs less.
6833 SmallVector<Instruction *, 8> Worklist;
6835 // Returns true if the given instruction can be scalarized.
6836 auto canBeScalarized = [&](Instruction *I) -> bool {
6837 // We only attempt to scalarize instructions forming a single-use chain
6838 // from the original predicated block that would otherwise be vectorized.
6839 // Although not strictly necessary, we give up on instructions we know will
6840 // already be scalar to avoid traversing chains that are unlikely to be
6841 // beneficial.
6842 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6843 isScalarAfterVectorization(I, VF))
6844 return false;
6846 // If the instruction is scalar with predication, it will be analyzed
6847 // separately. We ignore it within the context of PredInst.
6848 if (isScalarWithPredication(I))
6849 return false;
6851 // If any of the instruction's operands are uniform after vectorization,
6852 // the instruction cannot be scalarized. This prevents, for example, a
6853 // masked load from being scalarized.
6855 // We assume we will only emit a value for lane zero of an instruction
6856 // marked uniform after vectorization, rather than VF identical values.
6857 // Thus, if we scalarize an instruction that uses a uniform, we would
6858 // create uses of values corresponding to the lanes we aren't emitting code
6859 // for. This behavior can be changed by allowing getScalarValue to clone
6860 // the lane zero values for uniforms rather than asserting.
6861 for (Use &U : I->operands())
6862 if (auto *J = dyn_cast<Instruction>(U.get()))
6863 if (isUniformAfterVectorization(J, VF))
6864 return false;
6866 // Otherwise, we can scalarize the instruction.
6867 return true;
6870 // Compute the expected cost discount from scalarizing the entire expression
6871 // feeding the predicated instruction. We currently only consider expressions
6872 // that are single-use instruction chains.
6873 Worklist.push_back(PredInst);
6874 while (!Worklist.empty()) {
6875 Instruction *I = Worklist.pop_back_val();
6877 // If we've already analyzed the instruction, there's nothing to do.
6878 if (ScalarCosts.find(I) != ScalarCosts.end())
6879 continue;
6881 // Compute the cost of the vector instruction. Note that this cost already
6882 // includes the scalarization overhead of the predicated instruction.
6883 InstructionCost VectorCost = getInstructionCost(I, VF).first;
6885 // Compute the cost of the scalarized instruction. This cost is the cost of
6886 // the instruction as if it wasn't if-converted and instead remained in the
6887 // predicated block. We will scale this cost by block probability after
6888 // computing the scalarization overhead.
6889 InstructionCost ScalarCost =
6890 VF.getFixedValue() *
6891 getInstructionCost(I, ElementCount::getFixed(1)).first;
6893 // Compute the scalarization overhead of needed insertelement instructions
6894 // and phi nodes.
6895 if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6896 ScalarCost += TTI.getScalarizationOverhead(
6897 cast<VectorType>(ToVectorTy(I->getType(), VF)),
6898 APInt::getAllOnesValue(VF.getFixedValue()), true, false);
6899 ScalarCost +=
6900 VF.getFixedValue() *
6901 TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6904 // Compute the scalarization overhead of needed extractelement
6905 // instructions. For each of the instruction's operands, if the operand can
6906 // be scalarized, add it to the worklist; otherwise, account for the
6907 // overhead.
6908 for (Use &U : I->operands())
6909 if (auto *J = dyn_cast<Instruction>(U.get())) {
6910 assert(VectorType::isValidElementType(J->getType()) &&
6911 "Instruction has non-scalar type");
6912 if (canBeScalarized(J))
6913 Worklist.push_back(J);
6914 else if (needsExtract(J, VF)) {
6915 ScalarCost += TTI.getScalarizationOverhead(
6916 cast<VectorType>(ToVectorTy(J->getType(), VF)),
6917 APInt::getAllOnesValue(VF.getFixedValue()), false, true);
6921 // Scale the total scalar cost by block probability.
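    // For illustration, assuming the default reciprocal predicated-block
    // probability of 2 (i.e. the block executes on roughly half of the
    // iterations), a raw scalar chain cost of 8 contributes 4 here.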
6922 ScalarCost /= getReciprocalPredBlockProb();
6924 // Compute the discount. A non-negative discount means the vector version
6925 // of the instruction costs more, and scalarizing would be beneficial.
6926 Discount += VectorCost - ScalarCost;
6927 ScalarCosts[I] = ScalarCost;
6930 return *Discount.getValue();
6933 LoopVectorizationCostModel::VectorizationCostTy
6934 LoopVectorizationCostModel::expectedCost(
6935 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6936 VectorizationCostTy Cost;
6938 // For each block.
6939 for (BasicBlock *BB : TheLoop->blocks()) {
6940 VectorizationCostTy BlockCost;
6942 // For each instruction in the old loop.
6943 for (Instruction &I : BB->instructionsWithoutDebug()) {
6944 // Skip ignored values.
6945 if (ValuesToIgnore.count(&I) ||
6946 (VF.isVector() && VecValuesToIgnore.count(&I)))
6947 continue;
6949 VectorizationCostTy C = getInstructionCost(&I, VF);
6951 // Check if we should override the cost.
6952 if (C.first.isValid() &&
6953 ForceTargetInstructionCost.getNumOccurrences() > 0)
6954 C.first = InstructionCost(ForceTargetInstructionCost);
6956 // Keep a list of instructions with invalid costs.
6957 if (Invalid && !C.first.isValid())
6958 Invalid->emplace_back(&I, VF);
6960 BlockCost.first += C.first;
6961 BlockCost.second |= C.second;
6962 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6963 << " for VF " << VF << " For instruction: " << I
6964 << '\n');
6967 // If we are vectorizing a predicated block, it will have been
6968 // if-converted. This means that the block's instructions (aside from
6969 // stores and instructions that may divide by zero) will now be
6970 // unconditionally executed. For the scalar case, we may not always execute
6971 // the predicated block, if it is an if-else block. Thus, scale the block's
6972 // cost by the probability of executing it. blockNeedsPredication from
6973 // Legal is used so as to not include all blocks in tail folded loops.
6974 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6975 BlockCost.first /= getReciprocalPredBlockProb();
6977 Cost.first += BlockCost.first;
6978 Cost.second |= BlockCost.second;
6981 return Cost;
6984 /// Gets the address access SCEV after verifying that the access pattern
6985 /// is loop invariant except for the induction variable dependence.
6987 /// This SCEV can be sent to the Target in order to estimate the address
6988 /// calculation cost.
6989 static const SCEV *getAddressAccessSCEV(
6990 Value *Ptr,
6991 LoopVectorizationLegality *Legal,
6992 PredicatedScalarEvolution &PSE,
6993 const Loop *TheLoop) {
6995 auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6996 if (!Gep)
6997 return nullptr;
6999 // We are looking for a gep with all loop invariant indices except for one
7000 // which should be an induction variable.
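  // A hypothetical example of the accepted shape:
  //   %gep = getelementptr inbounds [256 x i32], [256 x i32]* %base, i64 0, i64 %iv
  // where %base is loop invariant and %iv is an induction variable.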
7001 auto SE = PSE.getSE();
7002 unsigned NumOperands = Gep->getNumOperands();
7003 for (unsigned i = 1; i < NumOperands; ++i) {
7004 Value *Opd = Gep->getOperand(i);
7005 if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
7006 !Legal->isInductionVariable(Opd))
7007 return nullptr;
7010   // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the Ptr SCEV.
7011 return PSE.getSCEV(Ptr);
7014 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
7015 return Legal->hasStride(I->getOperand(0)) ||
7016 Legal->hasStride(I->getOperand(1));
7019 InstructionCost
7020 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
7021 ElementCount VF) {
7022 assert(VF.isVector() &&
7023 "Scalarization cost of instruction implies vectorization.");
7024 if (VF.isScalable())
7025 return InstructionCost::getInvalid();
7027 Type *ValTy = getLoadStoreType(I);
7028 auto SE = PSE.getSE();
7030 unsigned AS = getLoadStoreAddressSpace(I);
7031 Value *Ptr = getLoadStorePointerOperand(I);
7032 Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
7034   // Figure out whether the access is strided and get the stride value
7035   // if it's known at compile time.
7036 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
7038 // Get the cost of the scalar memory instruction and address computation.
7039 InstructionCost Cost =
7040 VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
7042 // Don't pass *I here, since it is scalar but will actually be part of a
7043 // vectorized loop where the user of it is a vectorized instruction.
7044 const Align Alignment = getLoadStoreAlignment(I);
7045 Cost += VF.getKnownMinValue() *
7046 TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
7047 AS, TTI::TCK_RecipThroughput);
7049 // Get the overhead of the extractelement and insertelement instructions
7050 // we might create due to scalarization.
7051 Cost += getScalarizationOverhead(I, VF);
7053 // If we have a predicated load/store, it will need extra i1 extracts and
7054 // conditional branches, but may not be executed for each vector lane. Scale
7055 // the cost by the probability of executing the predicated block.
7056 if (isPredicatedInst(I)) {
7057 Cost /= getReciprocalPredBlockProb();
7059 // Add the cost of an i1 extract and a branch
7060 auto *Vec_i1Ty =
7061 VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
7062 Cost += TTI.getScalarizationOverhead(
7063 Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7064 /*Insert=*/false, /*Extract=*/true);
7065 Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
7067 if (useEmulatedMaskMemRefHack(I))
7068       // Artificially set the cost to a value high enough to practically
7069       // disable vectorization of such operations.
7070 Cost = 3000000;
7073 return Cost;
7076 InstructionCost
7077 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7078 ElementCount VF) {
7079 Type *ValTy = getLoadStoreType(I);
7080 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7081 Value *Ptr = getLoadStorePointerOperand(I);
7082 unsigned AS = getLoadStoreAddressSpace(I);
7083 int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
7084 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7086 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7087 "Stride should be 1 or -1 for consecutive memory access");
7088 const Align Alignment = getLoadStoreAlignment(I);
7089 InstructionCost Cost = 0;
7090 if (Legal->isMaskRequired(I))
7091 Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7092 CostKind);
7093 else
7094 Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7095 CostKind, I);
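  // A reverse (stride -1) access, e.g. a loop walking an array from its end
  // towards its start, additionally pays for one reverse shuffle per vector.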
7097 bool Reverse = ConsecutiveStride < 0;
7098 if (Reverse)
7099 Cost +=
7100 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7101 return Cost;
7104 InstructionCost
7105 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7106 ElementCount VF) {
7107 assert(Legal->isUniformMemOp(*I));
7109 Type *ValTy = getLoadStoreType(I);
7110 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7111 const Align Alignment = getLoadStoreAlignment(I);
7112 unsigned AS = getLoadStoreAddressSpace(I);
7113 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7114 if (isa<LoadInst>(I)) {
7115 return TTI.getAddressComputationCost(ValTy) +
7116 TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
7117 CostKind) +
7118 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7120 StoreInst *SI = cast<StoreInst>(I);
7122 bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
7123 return TTI.getAddressComputationCost(ValTy) +
7124 TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
7125 CostKind) +
7126          (isLoopInvariantStoreValue
7127               ? 0
7128               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7129 VF.getKnownMinValue() - 1));
7132 InstructionCost
7133 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7134 ElementCount VF) {
7135 Type *ValTy = getLoadStoreType(I);
7136 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7137 const Align Alignment = getLoadStoreAlignment(I);
7138 const Value *Ptr = getLoadStorePointerOperand(I);
7140 return TTI.getAddressComputationCost(VectorTy) +
7141 TTI.getGatherScatterOpCost(
7142 I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7143 TargetTransformInfo::TCK_RecipThroughput, I);
7146 InstructionCost
7147 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7148 ElementCount VF) {
7149 // TODO: Once we have support for interleaving with scalable vectors
7150 // we can calculate the cost properly here.
7151 if (VF.isScalable())
7152 return InstructionCost::getInvalid();
7154 Type *ValTy = getLoadStoreType(I);
7155 auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7156 unsigned AS = getLoadStoreAddressSpace(I);
7158 auto Group = getInterleavedAccessGroup(I);
7159   assert(Group && "Failed to get an interleaved access group.");
7161 unsigned InterleaveFactor = Group->getFactor();
7162 auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7164 // Holds the indices of existing members in the interleaved group.
7165 SmallVector<unsigned, 4> Indices;
7166 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
7167 if (Group->getMember(IF))
7168 Indices.push_back(IF);
7170 // Calculate the cost of the whole interleaved group.
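  // Masking for gaps is needed when the group requires a scalar epilogue but
  // one is not allowed, or when a store group has missing members.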
7171 bool UseMaskForGaps =
7172 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
7173 (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
7174 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
7175 I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
7176 AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
7178 if (Group->isReverse()) {
7179 // TODO: Add support for reversed masked interleaved access.
7180 assert(!Legal->isMaskRequired(I) &&
7181 "Reverse masked interleaved access not supported.");
7182 Cost +=
7183 Group->getNumMembers() *
7184 TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7186 return Cost;
7189 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
7190 Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
7191 using namespace llvm::PatternMatch;
7192 // Early exit for no inloop reductions
7193 if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
7194 return None;
7195 auto *VectorTy = cast<VectorType>(Ty);
7197   // We are looking for one of the following patterns, and for the minimal
7198   // acceptable cost of it:
7199   //  reduce(mul(ext(A), ext(B))) or
7200   //  reduce(mul(A, B)) or
7201   //  reduce(ext(A)) or
7202   //  reduce(A).
7203   // The basic idea is that we walk down the tree to do that, finding the root
7204   // reduction instruction in InLoopReductionImmediateChains. From there we find
7205   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
7206   // of its components. If the reduction cost is lower, we return it for the
7207   // reduction instruction and 0 for the other instructions in the pattern. If
7208   // not, we return an invalid cost and the original cost method should be used.
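  // For example (hypothetical IR names), an in-loop chain such as
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul
  // matches the reduce(mul(ext(A), ext(B))) form and may be costed as a single
  // extended multiply-add reduction when the target reports that as cheaper
  // than the sum of the individual instruction costs.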
7209 Instruction *RetI = I;
7210 if (match(RetI, m_ZExtOrSExt(m_Value()))) {
7211 if (!RetI->hasOneUser())
7212 return None;
7213 RetI = RetI->user_back();
7215 if (match(RetI, m_Mul(m_Value(), m_Value())) &&
7216 RetI->user_back()->getOpcode() == Instruction::Add) {
7217 if (!RetI->hasOneUser())
7218 return None;
7219 RetI = RetI->user_back();
7222   // Test if the found instruction is a reduction; if not, return an invalid
7223   // cost so that the parent uses the original cost modelling.
7224 if (!InLoopReductionImmediateChains.count(RetI))
7225 return None;
7227 // Find the reduction this chain is a part of and calculate the basic cost of
7228 // the reduction on its own.
7229 Instruction *LastChain = InLoopReductionImmediateChains[RetI];
7230 Instruction *ReductionPhi = LastChain;
7231 while (!isa<PHINode>(ReductionPhi))
7232 ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
7234 const RecurrenceDescriptor &RdxDesc =
7235 Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
7237 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
7238 RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
7240 // If we're using ordered reductions then we can just return the base cost
7241 // here, since getArithmeticReductionCost calculates the full ordered
7242 // reduction cost when FP reassociation is not allowed.
7243 if (useOrderedReductions(RdxDesc))
7244 return BaseCost;
7246 // Get the operand that was not the reduction chain and match it to one of the
7247 // patterns, returning the better cost if it is found.
7248 Instruction *RedOp = RetI->getOperand(1) == LastChain
7249 ? dyn_cast<Instruction>(RetI->getOperand(0))
7250 : dyn_cast<Instruction>(RetI->getOperand(1));
7252 VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
7254 Instruction *Op0, *Op1;
7255 if (RedOp &&
7256 match(RedOp,
7257 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
7258 match(Op0, m_ZExtOrSExt(m_Value())) &&
7259 Op0->getOpcode() == Op1->getOpcode() &&
7260 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7261 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
7262 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
7264     // Matched reduce(ext(mul(ext(A), ext(B))))
7265     // Note that the extend opcodes all need to match; or, if A==B, they will have
7266 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
7267 // which is equally fine.
7268 bool IsUnsigned = isa<ZExtInst>(Op0);
7269 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7270 auto *MulType = VectorType::get(Op0->getType(), VectorTy);
7272 InstructionCost ExtCost =
7273 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
7274 TTI::CastContextHint::None, CostKind, Op0);
7275 InstructionCost MulCost =
7276 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
7277 InstructionCost Ext2Cost =
7278 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
7279 TTI::CastContextHint::None, CostKind, RedOp);
7281 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7282 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7283 CostKind);
7285 if (RedCost.isValid() &&
7286 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
7287 return I == RetI ? RedCost : 0;
7288 } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
7289 !TheLoop->isLoopInvariant(RedOp)) {
7290 // Matched reduce(ext(A))
7291 bool IsUnsigned = isa<ZExtInst>(RedOp);
7292 auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7293 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7294 /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7295 CostKind);
7297 InstructionCost ExtCost =
7298 TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7299 TTI::CastContextHint::None, CostKind, RedOp);
7300 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7301 return I == RetI ? RedCost : 0;
7302 } else if (RedOp &&
7303 match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
7304 if (match(Op0, m_ZExtOrSExt(m_Value())) &&
7305 Op0->getOpcode() == Op1->getOpcode() &&
7306 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7307 !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7308 bool IsUnsigned = isa<ZExtInst>(Op0);
7309 auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7310 // Matched reduce(mul(ext, ext))
7311 InstructionCost ExtCost =
7312 TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
7313 TTI::CastContextHint::None, CostKind, Op0);
7314 InstructionCost MulCost =
7315 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7317 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7318 /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7319 CostKind);
7321 if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
7322 return I == RetI ? RedCost : 0;
7323 } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7324 // Matched reduce(mul())
7325 InstructionCost MulCost =
7326 TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7328 InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7329 /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7330 CostKind);
7332 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7333 return I == RetI ? RedCost : 0;
7337 return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7340 InstructionCost
7341 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7342 ElementCount VF) {
7343 // Calculate scalar cost only. Vectorization cost should be ready at this
7344 // moment.
7345 if (VF.isScalar()) {
7346 Type *ValTy = getLoadStoreType(I);
7347 const Align Alignment = getLoadStoreAlignment(I);
7348 unsigned AS = getLoadStoreAddressSpace(I);
7350 return TTI.getAddressComputationCost(ValTy) +
7351 TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7352 TTI::TCK_RecipThroughput, I);
7354 return getWideningCost(I, VF);
7357 LoopVectorizationCostModel::VectorizationCostTy
7358 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7359 ElementCount VF) {
7360 // If we know that this instruction will remain uniform, check the cost of
7361 // the scalar version.
7362 if (isUniformAfterVectorization(I, VF))
7363 VF = ElementCount::getFixed(1);
7365 if (VF.isVector() && isProfitableToScalarize(I, VF))
7366 return VectorizationCostTy(InstsToScalarize[VF][I], false);
7368 // Forced scalars do not have any scalarization overhead.
7369 auto ForcedScalar = ForcedScalars.find(VF);
7370 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7371 auto InstSet = ForcedScalar->second;
7372 if (InstSet.count(I))
7373 return VectorizationCostTy(
7374 (getInstructionCost(I, ElementCount::getFixed(1)).first *
7375 VF.getKnownMinValue()),
7376 false);
7379 Type *VectorTy;
7380 InstructionCost C = getInstructionCost(I, VF, VectorTy);
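  // The type counts as "not scalarized" when the target legalizes the vector
  // type into fewer parts than there are lanes, i.e. the instruction really
  // executes on vectors rather than being split per element.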
7382 bool TypeNotScalarized =
7383 VF.isVector() && VectorTy->isVectorTy() &&
7384 TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7385 return VectorizationCostTy(C, TypeNotScalarized);
7388 InstructionCost
7389 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7390 ElementCount VF) const {
7392 // There is no mechanism yet to create a scalable scalarization loop,
7393 // so this is currently Invalid.
7394 if (VF.isScalable())
7395 return InstructionCost::getInvalid();
7397 if (VF.isScalar())
7398 return 0;
7400 InstructionCost Cost = 0;
7401 Type *RetTy = ToVectorTy(I->getType(), VF);
7402 if (!RetTy->isVoidTy() &&
7403 (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7404 Cost += TTI.getScalarizationOverhead(
7405 cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
7406 true, false);
7408 // Some targets keep addresses scalar.
7409 if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7410 return Cost;
7412 // Some targets support efficient element stores.
7413 if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7414 return Cost;
7416 // Collect operands to consider.
7417 CallInst *CI = dyn_cast<CallInst>(I);
7418 Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
7420 // Skip operands that do not require extraction/scalarization and do not incur
7421 // any overhead.
7422 SmallVector<Type *> Tys;
7423 for (auto *V : filterExtractingOperands(Ops, VF))
7424 Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7425 return Cost + TTI.getOperandsScalarizationOverhead(
7426 filterExtractingOperands(Ops, VF), Tys);
7429 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7430 if (VF.isScalar())
7431 return;
7432 NumPredStores = 0;
7433 for (BasicBlock *BB : TheLoop->blocks()) {
7434 // For each instruction in the old loop.
7435 for (Instruction &I : *BB) {
7436 Value *Ptr = getLoadStorePointerOperand(&I);
7437 if (!Ptr)
7438 continue;
7440 // TODO: We should generate better code and update the cost model for
7441 // predicated uniform stores. Today they are treated as any other
7442 // predicated store (see added test cases in
7443 // invariant-store-vectorization.ll).
7444 if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7445 NumPredStores++;
7447 if (Legal->isUniformMemOp(I)) {
7448 // TODO: Avoid replicating loads and stores instead of
7449 // relying on instcombine to remove them.
7450 // Load: Scalar load + broadcast
7451 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
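        // That is, a uniform load becomes one scalar load plus a broadcast,
        // and a uniform store only pays for an extract of the last lane when
        // the stored value actually varies per lane.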
7452 InstructionCost Cost;
7453 if (isa<StoreInst>(&I) && VF.isScalable() &&
7454 isLegalGatherOrScatter(&I)) {
7455 Cost = getGatherScatterCost(&I, VF);
7456 setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7457 } else {
7458 assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7459 "Cannot yet scalarize uniform stores");
7460 Cost = getUniformMemOpCost(&I, VF);
7461 setWideningDecision(&I, VF, CM_Scalarize, Cost);
7463 continue;
7466 // We assume that widening is the best solution when possible.
7467 if (memoryInstructionCanBeWidened(&I, VF)) {
7468 InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7469 int ConsecutiveStride =
7470 Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7471 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7472 "Expected consecutive stride.");
7473 InstWidening Decision =
7474 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7475 setWideningDecision(&I, VF, Decision, Cost);
7476 continue;
7479 // Choose between Interleaving, Gather/Scatter or Scalarization.
7480 InstructionCost InterleaveCost = InstructionCost::getInvalid();
7481 unsigned NumAccesses = 1;
7482 if (isAccessInterleaved(&I)) {
7483 auto Group = getInterleavedAccessGroup(&I);
7484       assert(Group && "Failed to get an interleaved access group.");
7486 // Make one decision for the whole group.
7487 if (getWideningDecision(&I, VF) != CM_Unknown)
7488 continue;
7490 NumAccesses = Group->getNumMembers();
7491 if (interleavedAccessCanBeWidened(&I, VF))
7492 InterleaveCost = getInterleaveGroupCost(&I, VF);
7495 InstructionCost GatherScatterCost =
7496 isLegalGatherOrScatter(&I)
7497 ? getGatherScatterCost(&I, VF) * NumAccesses
7498 : InstructionCost::getInvalid();
7500 InstructionCost ScalarizationCost =
7501 getMemInstScalarizationCost(&I, VF) * NumAccesses;
7503       // Choose the best solution for the current VF, record this decision,
7504       // and use it during vectorization.
7505 InstructionCost Cost;
7506 InstWidening Decision;
7507 if (InterleaveCost <= GatherScatterCost &&
7508 InterleaveCost < ScalarizationCost) {
7509 Decision = CM_Interleave;
7510 Cost = InterleaveCost;
7511 } else if (GatherScatterCost < ScalarizationCost) {
7512 Decision = CM_GatherScatter;
7513 Cost = GatherScatterCost;
7514 } else {
7515 Decision = CM_Scalarize;
7516 Cost = ScalarizationCost;
7518       // If the instruction belongs to an interleave group, the whole group
7519       // receives the same decision. The cost is recorded for the group, but
7520       // it will actually be assigned to a single member instruction.
7521 if (auto Group = getInterleavedAccessGroup(&I))
7522 setWideningDecision(Group, VF, Decision, Cost);
7523 else
7524 setWideningDecision(&I, VF, Decision, Cost);
7528   // Make sure that any load of an address and any other address computation
7529 // remains scalar unless there is gather/scatter support. This avoids
7530 // inevitable extracts into address registers, and also has the benefit of
7531 // activating LSR more, since that pass can't optimize vectorized
7532 // addresses.
7533 if (TTI.prefersVectorizedAddressing())
7534 return;
7536 // Start with all scalar pointer uses.
7537 SmallPtrSet<Instruction *, 8> AddrDefs;
7538 for (BasicBlock *BB : TheLoop->blocks())
7539 for (Instruction &I : *BB) {
7540 Instruction *PtrDef =
7541 dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7542 if (PtrDef && TheLoop->contains(PtrDef) &&
7543 getWideningDecision(&I, VF) != CM_GatherScatter)
7544 AddrDefs.insert(PtrDef);
7547 // Add all instructions used to generate the addresses.
7548 SmallVector<Instruction *, 4> Worklist;
7549 append_range(Worklist, AddrDefs);
7550 while (!Worklist.empty()) {
7551 Instruction *I = Worklist.pop_back_val();
7552 for (auto &Op : I->operands())
7553 if (auto *InstOp = dyn_cast<Instruction>(Op))
7554 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7555 AddrDefs.insert(InstOp).second)
7556 Worklist.push_back(InstOp);
7559 for (auto *I : AddrDefs) {
7560 if (isa<LoadInst>(I)) {
7561       // Setting the desired widening decision should ideally be handled by
7562       // the cost functions, but since this involves finding out whether the
7563       // loaded register is involved in an address computation, it is instead
7564       // changed here when we know this is the case.
7565 InstWidening Decision = getWideningDecision(I, VF);
7566 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7567 // Scalarize a widened load of address.
7568 setWideningDecision(
7569 I, VF, CM_Scalarize,
7570 (VF.getKnownMinValue() *
7571 getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7572 else if (auto Group = getInterleavedAccessGroup(I)) {
7573 // Scalarize an interleave group of address loads.
7574 for (unsigned I = 0; I < Group->getFactor(); ++I) {
7575 if (Instruction *Member = Group->getMember(I))
7576 setWideningDecision(
7577 Member, VF, CM_Scalarize,
7578 (VF.getKnownMinValue() *
7579 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7582 } else
7583         // Make sure I gets scalarized and receives a cost estimate without
7584         // scalarization overhead.
7585 ForcedScalars[VF].insert(I);
7589 InstructionCost
7590 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7591 Type *&VectorTy) {
7592 Type *RetTy = I->getType();
7593 if (canTruncateToMinimalBitwidth(I, VF))
7594 RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7595 auto SE = PSE.getSE();
7596 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7598 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7599 ElementCount VF) -> bool {
7600 if (VF.isScalar())
7601 return true;
7603 auto Scalarized = InstsToScalarize.find(VF);
7604 assert(Scalarized != InstsToScalarize.end() &&
7605 "VF not yet analyzed for scalarization profitability");
7606 return !Scalarized->second.count(I) &&
7607 llvm::all_of(I->users(), [&](User *U) {
7608 auto *UI = cast<Instruction>(U);
7609 return !Scalarized->second.count(UI);
7612 (void) hasSingleCopyAfterVectorization;
7614 if (isScalarAfterVectorization(I, VF)) {
7615 // With the exception of GEPs and PHIs, after scalarization there should
7616 // only be one copy of the instruction generated in the loop. This is
7617 // because the VF is either 1, or any instructions that need scalarizing
7618     // have already been dealt with by the time we get here. As a result,
7619 // it means we don't have to multiply the instruction cost by VF.
7620 assert(I->getOpcode() == Instruction::GetElementPtr ||
7621 I->getOpcode() == Instruction::PHI ||
7622 (I->getOpcode() == Instruction::BitCast &&
7623 I->getType()->isPointerTy()) ||
7624 hasSingleCopyAfterVectorization(I, VF));
7625 VectorTy = RetTy;
7626 } else
7627 VectorTy = ToVectorTy(RetTy, VF);
7629 // TODO: We need to estimate the cost of intrinsic calls.
7630 switch (I->getOpcode()) {
7631 case Instruction::GetElementPtr:
7632 // We mark this instruction as zero-cost because the cost of GEPs in
7633 // vectorized code depends on whether the corresponding memory instruction
7634 // is scalarized or not. Therefore, we handle GEPs with the memory
7635 // instruction cost.
7636 return 0;
7637 case Instruction::Br: {
7638 // In cases of scalarized and predicated instructions, there will be VF
7639 // predicated blocks in the vectorized loop. Each branch around these
7640     // blocks also requires an extract of its vector compare i1 element.
7641 bool ScalarPredicatedBB = false;
7642 BranchInst *BI = cast<BranchInst>(I);
7643 if (VF.isVector() && BI->isConditional() &&
7644 (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7645 PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7646 ScalarPredicatedBB = true;
7648 if (ScalarPredicatedBB) {
7649       // Not possible to scalarize a scalable vector with predicated instructions.
7650 if (VF.isScalable())
7651 return InstructionCost::getInvalid();
7652 // Return cost for branches around scalarized and predicated blocks.
7653 auto *Vec_i1Ty =
7654 VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7655 return (
7656 TTI.getScalarizationOverhead(
7657 Vec_i1Ty, APInt::getAllOnesValue(VF.getFixedValue()), false,
7658 true) +
7659 (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7660 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7661 // The back-edge branch will remain, as will all scalar branches.
7662 return TTI.getCFInstrCost(Instruction::Br, CostKind);
7663 else
7664 // This branch will be eliminated by if-conversion.
7665 return 0;
7666 // Note: We currently assume zero cost for an unconditional branch inside
7667 // a predicated block since it will become a fall-through, although we
7668 // may decide in the future to call TTI for all branches.
7670 case Instruction::PHI: {
7671 auto *Phi = cast<PHINode>(I);
7673 // First-order recurrences are replaced by vector shuffles inside the loop.
7674 // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7675 if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7676 return TTI.getShuffleCost(
7677 TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7678 None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7680 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7681 // converted into select instructions. We require N - 1 selects per phi
7682 // node, where N is the number of incoming values.
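    // For example, a phi merging values from three predecessors is lowered to
    // two chained vector selects.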
7683 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7684 return (Phi->getNumIncomingValues() - 1) *
7685 TTI.getCmpSelInstrCost(
7686 Instruction::Select, ToVectorTy(Phi->getType(), VF),
7687 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7688 CmpInst::BAD_ICMP_PREDICATE, CostKind);
7690 return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7692 case Instruction::UDiv:
7693 case Instruction::SDiv:
7694 case Instruction::URem:
7695 case Instruction::SRem:
7696 // If we have a predicated instruction, it may not be executed for each
7697 // vector lane. Get the scalarization cost and scale this amount by the
7698 // probability of executing the predicated block. If the instruction is not
7699 // predicated, we fall through to the next case.
7700 if (VF.isVector() && isScalarWithPredication(I)) {
7701 InstructionCost Cost = 0;
7703 // These instructions have a non-void type, so account for the phi nodes
7704 // that we will create. This cost is likely to be zero. The phi node
7705 // cost, if any, should be scaled by the block probability because it
7706 // models a copy at the end of each predicated block.
7707 Cost += VF.getKnownMinValue() *
7708 TTI.getCFInstrCost(Instruction::PHI, CostKind);
7710 // The cost of the non-predicated instruction.
7711 Cost += VF.getKnownMinValue() *
7712 TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7714 // The cost of insertelement and extractelement instructions needed for
7715 // scalarization.
7716 Cost += getScalarizationOverhead(I, VF);
7718 // Scale the cost by the probability of executing the predicated blocks.
7719 // This assumes the predicated block for each vector lane is equally
7720 // likely.
7721 return Cost / getReciprocalPredBlockProb();
7723 LLVM_FALLTHROUGH;
7724 case Instruction::Add:
7725 case Instruction::FAdd:
7726 case Instruction::Sub:
7727 case Instruction::FSub:
7728 case Instruction::Mul:
7729 case Instruction::FMul:
7730 case Instruction::FDiv:
7731 case Instruction::FRem:
7732 case Instruction::Shl:
7733 case Instruction::LShr:
7734 case Instruction::AShr:
7735 case Instruction::And:
7736 case Instruction::Or:
7737 case Instruction::Xor: {
7738 // Since we will replace the stride by 1 the multiplication should go away.
7739 if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7740 return 0;
7742 // Detect reduction patterns
7743 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7744 return *RedCost;
7746 // Certain instructions can be cheaper to vectorize if they have a constant
7747     // second vector operand. One example of this is shifts on x86.
7748 Value *Op2 = I->getOperand(1);
7749 TargetTransformInfo::OperandValueProperties Op2VP;
7750 TargetTransformInfo::OperandValueKind Op2VK =
7751 TTI.getOperandInfo(Op2, Op2VP);
7752 if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7753 Op2VK = TargetTransformInfo::OK_UniformValue;
7755 SmallVector<const Value *, 4> Operands(I->operand_values());
7756 return TTI.getArithmeticInstrCost(
7757 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7758 Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7760 case Instruction::FNeg: {
7761 return TTI.getArithmeticInstrCost(
7762 I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7763 TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7764 TargetTransformInfo::OP_None, I->getOperand(0), I);
7766 case Instruction::Select: {
7767 SelectInst *SI = cast<SelectInst>(I);
7768 const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7769 bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7771 const Value *Op0, *Op1;
7772 using namespace llvm::PatternMatch;
7773 if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7774 match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7775 // select x, y, false --> x & y
7776 // select x, true, y --> x | y
7777 TTI::OperandValueProperties Op1VP = TTI::OP_None;
7778 TTI::OperandValueProperties Op2VP = TTI::OP_None;
7779 TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7780 TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7781 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7782 Op1->getType()->getScalarSizeInBits() == 1);
7784 SmallVector<const Value *, 2> Operands{Op0, Op1};
7785 return TTI.getArithmeticInstrCost(
7786 match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7787 CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7790 Type *CondTy = SI->getCondition()->getType();
7791 if (!ScalarCond)
7792 CondTy = VectorType::get(CondTy, VF);
7793 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7794 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7796 case Instruction::ICmp:
7797 case Instruction::FCmp: {
7798 Type *ValTy = I->getOperand(0)->getType();
7799 Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7800 if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7801 ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7802 VectorTy = ToVectorTy(ValTy, VF);
7803 return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7804 CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7806 case Instruction::Store:
7807 case Instruction::Load: {
7808 ElementCount Width = VF;
7809 if (Width.isVector()) {
7810 InstWidening Decision = getWideningDecision(I, Width);
7811 assert(Decision != CM_Unknown &&
7812 "CM decision should be taken at this point");
7813 if (Decision == CM_Scalarize)
7814 Width = ElementCount::getFixed(1);
7816 VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7817 return getMemoryInstructionCost(I, VF);
7819 case Instruction::BitCast:
7820 if (I->getType()->isPointerTy())
7821 return 0;
7822 LLVM_FALLTHROUGH;
7823 case Instruction::ZExt:
7824 case Instruction::SExt:
7825 case Instruction::FPToUI:
7826 case Instruction::FPToSI:
7827 case Instruction::FPExt:
7828 case Instruction::PtrToInt:
7829 case Instruction::IntToPtr:
7830 case Instruction::SIToFP:
7831 case Instruction::UIToFP:
7832 case Instruction::Trunc:
7833 case Instruction::FPTrunc: {
7834 // Computes the CastContextHint from a Load/Store instruction.
7835 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7836 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7837 "Expected a load or a store!");
7839 if (VF.isScalar() || !TheLoop->contains(I))
7840 return TTI::CastContextHint::Normal;
7842 switch (getWideningDecision(I, VF)) {
7843 case LoopVectorizationCostModel::CM_GatherScatter:
7844 return TTI::CastContextHint::GatherScatter;
7845 case LoopVectorizationCostModel::CM_Interleave:
7846 return TTI::CastContextHint::Interleave;
7847 case LoopVectorizationCostModel::CM_Scalarize:
7848 case LoopVectorizationCostModel::CM_Widen:
7849 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7850 : TTI::CastContextHint::Normal;
7851 case LoopVectorizationCostModel::CM_Widen_Reverse:
7852 return TTI::CastContextHint::Reversed;
7853 case LoopVectorizationCostModel::CM_Unknown:
7854 llvm_unreachable("Instr did not go through cost modelling?");
7857 llvm_unreachable("Unhandled case!");
7860 unsigned Opcode = I->getOpcode();
7861 TTI::CastContextHint CCH = TTI::CastContextHint::None;
7862 // For Trunc, the context is the only user, which must be a StoreInst.
7863 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7864 if (I->hasOneUse())
7865 if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7866 CCH = ComputeCCH(Store);
7868 // For Z/Sext, the context is the operand, which must be a LoadInst.
7869 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7870 Opcode == Instruction::FPExt) {
7871 if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7872 CCH = ComputeCCH(Load);
7875 // We optimize the truncation of induction variables having constant
7876 // integer steps. The cost of these truncations is the same as the scalar
7877 // operation.
7878 if (isOptimizableIVTruncate(I, VF)) {
7879 auto *Trunc = cast<TruncInst>(I);
7880 return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7881 Trunc->getSrcTy(), CCH, CostKind, Trunc);
7884 // Detect reduction patterns
7885 if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7886 return *RedCost;
7888 Type *SrcScalarTy = I->getOperand(0)->getType();
7889 Type *SrcVecTy =
7890 VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7891 if (canTruncateToMinimalBitwidth(I, VF)) {
7892 // This cast is going to be shrunk. This may remove the cast or it might
7893       // turn it into a slightly different cast. For example, if MinBW == 16,
7894 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7896 // Calculate the modified src and dest types.
7897 Type *MinVecTy = VectorTy;
7898 if (Opcode == Instruction::Trunc) {
7899 SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7900 VectorTy =
7901 largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7902 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7903 SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7904 VectorTy =
7905 smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7909 return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7911 case Instruction::Call: {
7912 bool NeedToScalarize;
7913 CallInst *CI = cast<CallInst>(I);
7914 InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7915 if (getVectorIntrinsicIDForCall(CI, TLI)) {
7916 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7917 return std::min(CallCost, IntrinsicCost);
7919 return CallCost;
7921 case Instruction::ExtractValue:
7922 return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7923 case Instruction::Alloca:
7924     // We cannot easily widen an alloca to a scalable alloca, as
7925 // the result would need to be a vector of pointers.
7926 if (VF.isScalable())
7927 return InstructionCost::getInvalid();
7928 LLVM_FALLTHROUGH;
7929 default:
7930 // This opcode is unknown. Assume that it is the same as 'mul'.
7931 return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7932 } // end of switch.
7935 char LoopVectorize::ID = 0;
7937 static const char lv_name[] = "Loop Vectorization";
7939 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7940 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7941 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7942 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7943 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7944 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7945 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7946 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7947 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7948 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7949 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7950 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7951 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7952 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7953 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7954 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7956 namespace llvm {
7958 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7960 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7961 bool VectorizeOnlyWhenForced) {
7962 return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7965 } // end namespace llvm
7967 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7968 // Check if the pointer operand of a load or store instruction is
7969 // consecutive.
7970 if (auto *Ptr = getLoadStorePointerOperand(Inst))
7971 return Legal->isConsecutivePtr(Ptr);
7972 return false;
7975 void LoopVectorizationCostModel::collectValuesToIgnore() {
7976 // Ignore ephemeral values.
7977 CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7979 // Ignore type-promoting instructions we identified during reduction
7980 // detection.
7981 for (auto &Reduction : Legal->getReductionVars()) {
7982 RecurrenceDescriptor &RedDes = Reduction.second;
7983 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7984 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7986 // Ignore type-casting instructions we identified during induction
7987 // detection.
7988 for (auto &Induction : Legal->getInductionVars()) {
7989 InductionDescriptor &IndDes = Induction.second;
7990 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7991 VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7995 void LoopVectorizationCostModel::collectInLoopReductions() {
7996 for (auto &Reduction : Legal->getReductionVars()) {
7997 PHINode *Phi = Reduction.first;
7998 RecurrenceDescriptor &RdxDesc = Reduction.second;
8000 // We don't collect reductions that are type promoted (yet).
8001 if (RdxDesc.getRecurrenceType() != Phi->getType())
8002 continue;
8004 // If the target would prefer this reduction to happen "in-loop", then we
8005 // want to record it as such.
8006 unsigned Opcode = RdxDesc.getOpcode();
8007 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
8008 !TTI.preferInLoopReduction(Opcode, Phi->getType(),
8009 TargetTransformInfo::ReductionFlags()))
8010 continue;
8012 // Check that we can correctly put the reductions into the loop, by
8013 // finding the chain of operations that leads from the phi to the loop
8014 // exit value.
8015 SmallVector<Instruction *, 4> ReductionOperations =
8016 RdxDesc.getReductionOpChain(Phi, TheLoop);
8017 bool InLoop = !ReductionOperations.empty();
8018 if (InLoop) {
8019 InLoopReductionChains[Phi] = ReductionOperations;
8020 // Add the elements to InLoopReductionImmediateChains for cost modelling.
8021 Instruction *LastChain = Phi;
8022 for (auto *I : ReductionOperations) {
8023 InLoopReductionImmediateChains[I] = LastChain;
8024 LastChain = I;
8027 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
8028 << " reduction for phi: " << *Phi << "\n");
8032 // TODO: we could return a pair of values that specify the max VF and
8033 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
8034 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
8035 // doesn't have a cost model that can choose which plan to execute if
8036 // more than one is generated.
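// For illustration, 256-bit wide vector registers and a widest loop type of
// i32 yield a VPlan VF of 256 / 32 = 8.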
8037 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
8038 LoopVectorizationCostModel &CM) {
8039 unsigned WidestType;
8040 std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
8041 return WidestVectorRegBits / WidestType;
8044 VectorizationFactor
8045 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
8046 assert(!UserVF.isScalable() && "scalable vectors not yet supported");
8047 ElementCount VF = UserVF;
8048   // Outer loop handling: outer loops may require CFG and instruction level
8049 // transformations before even evaluating whether vectorization is profitable.
8050 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8051 // the vectorization pipeline.
8052 if (!OrigLoop->isInnermost()) {
8053 // If the user doesn't provide a vectorization factor, determine a
8054 // reasonable one.
8055 if (UserVF.isZero()) {
8056 VF = ElementCount::getFixed(determineVPlanVF(
8057 TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
8058 .getFixedSize(),
8059 CM));
8060 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
8062 // Make sure we have a VF > 1 for stress testing.
8063 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
8064 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
8065 << "overriding computed VF.\n");
8066 VF = ElementCount::getFixed(4);
8069 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8070 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
8071 "VF needs to be a power of two");
8072 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
8073 << "VF " << VF << " to build VPlans.\n");
8074 buildVPlans(VF, VF);
8076 // For VPlan build stress testing, we bail out after VPlan construction.
8077 if (VPlanBuildStressTest)
8078 return VectorizationFactor::Disabled();
8080 return {VF, 0 /*Cost*/};
8083 LLVM_DEBUG(
8084 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
8085 "VPlan-native path.\n");
8086 return VectorizationFactor::Disabled();
8089 Optional<VectorizationFactor>
8090 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
8091 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8092 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
8093   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
8094 return None;
8096   // Invalidate interleave groups if all blocks of the loop will be predicated.
8097 if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
8098 !useMaskedInterleavedAccesses(*TTI)) {
8099 LLVM_DEBUG(
8100 dbgs()
8101 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
8102 "which requires masked-interleaved support.\n");
8103 if (CM.InterleaveInfo.invalidateGroups())
8104 // Invalidating interleave groups also requires invalidating all decisions
8105 // based on them, which includes widening decisions and uniform and scalar
8106 // values.
8107 CM.invalidateCostModelingDecisions();
8110 ElementCount MaxUserVF =
8111 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
8112 bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
8113 if (!UserVF.isZero() && UserVFIsLegal) {
8114 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
8115 "VF needs to be a power of two");
8116 // Collect the instructions (and their associated costs) that will be more
8117 // profitable to scalarize.
8118 if (CM.selectUserVectorizationFactor(UserVF)) {
8119 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
8120 CM.collectInLoopReductions();
8121 buildVPlansWithVPRecipes(UserVF, UserVF);
8122 LLVM_DEBUG(printPlans(dbgs()));
8123 return {{UserVF, 0}};
8124 } else
8125 reportVectorizationInfo("UserVF ignored because of invalid costs.",
8126 "InvalidCost", ORE, OrigLoop);
8129 // Populate the set of Vectorization Factor Candidates.
8130 ElementCountSet VFCandidates;
8131 for (auto VF = ElementCount::getFixed(1);
8132 ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
8133 VFCandidates.insert(VF);
8134 for (auto VF = ElementCount::getScalable(1);
8135 ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
8136 VFCandidates.insert(VF);
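  // The candidate set thus holds the powers of two 1, 2, 4, ... up to the
  // maximum fixed VF, plus vscale x 1, vscale x 2, ... up to the maximum
  // scalable VF.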
8138 for (const auto &VF : VFCandidates) {
8139 // Collect Uniform and Scalar instructions after vectorization with VF.
8140 CM.collectUniformsAndScalars(VF);
8142 // Collect the instructions (and their associated costs) that will be more
8143 // profitable to scalarize.
8144 if (VF.isVector())
8145 CM.collectInstsToScalarize(VF);
8148 CM.collectInLoopReductions();
8149 buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
8150 buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
8152 LLVM_DEBUG(printPlans(dbgs()));
8153 if (!MaxFactors.hasVector())
8154 return VectorizationFactor::Disabled();
8156 // Select the optimal vectorization factor.
8157 auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
8159 // Check if it is profitable to vectorize with runtime checks.
8160 unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
8161 if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
8162 bool PragmaThresholdReached =
8163 NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
8164 bool ThresholdReached =
8165 NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
8166 if ((ThresholdReached && !Hints.allowReordering()) ||
8167 PragmaThresholdReached) {
8168 ORE->emit([&]() {
8169 return OptimizationRemarkAnalysisAliasing(
8170 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
8171 OrigLoop->getHeader())
8172 << "loop not vectorized: cannot prove it is safe to reorder "
8173 "memory operations";
8175 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8176 Hints.emitRemarkWithHints();
8177 return VectorizationFactor::Disabled();
8180 return SelectedVF;
8183 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
8184 LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
8185 << '\n');
8186 BestVF = VF;
8187 BestUF = UF;
8189 erase_if(VPlans, [VF](const VPlanPtr &Plan) {
8190 return !Plan->hasVF(VF);
8192   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
8195 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
8196 DominatorTree *DT) {
8197 // Perform the actual loop transformation.
8199 // 1. Create a new empty loop. Unlink the old loop and connect the new one.
8200 assert(BestVF.hasValue() && "Vectorization Factor is missing");
8201 assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
8203 VPTransformState State{
8204 *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
8205 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
8206 State.TripCount = ILV.getOrCreateTripCount(nullptr);
8207 State.CanonicalIV = ILV.Induction;
8209 ILV.printDebugTracesAtStart();
8211 //===------------------------------------------------===//
8213 // Notice: any optimization or new instruction that goes
8214 // into the code below should also be implemented in
8215 // the cost-model.
8217 //===------------------------------------------------===//
8219 // 2. Copy and widen instructions from the old loop into the new loop.
8220 VPlans.front()->execute(&State);
8222 // 3. Fix the vectorized code: take care of header phi's, live-outs,
8223 // predication, updating analyses.
8224 ILV.fixVectorizedLoop(State);
8226 ILV.printDebugTracesAtEnd();
8229 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8230 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8231 for (const auto &Plan : VPlans)
8232 if (PrintVPlansInDotFormat)
8233 Plan->printDOT(O);
8234 else
8235 Plan->print(O);
8237 #endif
8239 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8240 SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8242 // We create new control-flow for the vectorized loop, so the original exit
8243 // conditions will be dead after vectorization if they are only used by the
8244 // terminator.
8245 SmallVector<BasicBlock*> ExitingBlocks;
8246 OrigLoop->getExitingBlocks(ExitingBlocks);
8247 for (auto *BB : ExitingBlocks) {
8248 auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8249 if (!Cmp || !Cmp->hasOneUse())
8250 continue;
8252 // TODO: we should introduce a getUniqueExitingBlocks on Loop
8253 if (!DeadInstructions.insert(Cmp).second)
8254 continue;
8256 // The operands of the icmp are often a dead trunc, used by IndUpdate.
8257 // TODO: can recurse through operands in general
8258 for (Value *Op : Cmp->operands()) {
8259 if (isa<TruncInst>(Op) && Op->hasOneUse())
8260 DeadInstructions.insert(cast<Instruction>(Op));
8264 // We create new "steps" for induction variable updates to which the original
8265 // induction variables map. An original update instruction will be dead if
8266 // all its users except the induction variable are dead.
8267 auto *Latch = OrigLoop->getLoopLatch();
8268 for (auto &Induction : Legal->getInductionVars()) {
8269 PHINode *Ind = Induction.first;
8270 auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8272 // If the tail is to be folded by masking, the primary induction variable,
8273 // if it exists, isn't dead: it will be used for masking. Don't kill it.
8274 if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8275 continue;
8277 if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8278 return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8280 DeadInstructions.insert(IndUpdate);
8282 // We record as "Dead" also the type-casting instructions we had identified
8283 // during induction analysis. We don't need any handling for them in the
8284 // vectorized loop because we have proven that, under a proper runtime
8285 // test guarding the vectorized loop, the value of the phi, and the casted
8286 // value of the phi, are the same. The last instruction in this casting chain
8287 // will get its scalar/vector/widened def from the scalar/vector/widened def
8288 // of the respective phi node. Any other casts in the induction def-use chain
8289 // have no other uses outside the phi update chain, and will be ignored.
8290 InductionDescriptor &IndDes = Induction.second;
8291 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8292 DeadInstructions.insert(Casts.begin(), Casts.end());
8296 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8298 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8300 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
8301 Instruction::BinaryOps BinOp) {
8302 // When unrolling and the VF is 1, we only need to add a simple scalar.
8303 Type *Ty = Val->getType();
8304 assert(!Ty->isVectorTy() && "Val must be a scalar");
8306 if (Ty->isFloatingPointTy()) {
8307 Constant *C = ConstantFP::get(Ty, (double)StartIdx);
8309 // Floating-point operations inherit FMF via the builder's flags.
8310 Value *MulOp = Builder.CreateFMul(C, Step);
8311 return Builder.CreateBinOp(BinOp, Val, MulOp);
8313 Constant *C = ConstantInt::get(Ty, StartIdx);
8314 return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
8317 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8318 SmallVector<Metadata *, 4> MDs;
8319 // Reserve first location for self reference to the LoopID metadata node.
8320 MDs.push_back(nullptr);
8321 bool IsUnrollMetadata = false;
8322 MDNode *LoopID = L->getLoopID();
8323 if (LoopID) {
8324 // First find existing loop unrolling disable metadata.
8325 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8326 auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8327 if (MD) {
8328 const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8329 IsUnrollMetadata =
8330 S && S->getString().startswith("llvm.loop.unroll.disable");
8332 MDs.push_back(LoopID->getOperand(i));
8336 if (!IsUnrollMetadata) {
8337 // Add runtime unroll disable metadata.
8338 LLVMContext &Context = L->getHeader()->getContext();
8339 SmallVector<Metadata *, 1> DisableOperands;
8340 DisableOperands.push_back(
8341 MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8342 MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8343 MDs.push_back(DisableNode);
8344 MDNode *NewLoopID = MDNode::get(Context, MDs);
8345 // Set operand 0 to refer to the loop id itself.
8346 NewLoopID->replaceOperandWith(0, NewLoopID);
8347 L->setLoopID(NewLoopID);
8351 //===--------------------------------------------------------------------===//
8352 // EpilogueVectorizerMainLoop
8353 //===--------------------------------------------------------------------===//
8355 /// This function is partially responsible for generating the control flow
8356 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8357 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8358 MDNode *OrigLoopID = OrigLoop->getLoopID();
8359 Loop *Lp = createVectorLoopSkeleton("");
8361 // Generate the code to check the minimum iteration count of the vector
8362 // epilogue (see below).
8363 EPI.EpilogueIterationCountCheck =
8364 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8365 EPI.EpilogueIterationCountCheck->setName("iter.check");
8367 // Generate the code to check any assumptions that we've made for SCEV
8368 // expressions.
8369 EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8371 // Generate the code that checks at runtime if arrays overlap. We put the
8372 // checks into a separate block to make the more common case of few elements
8373 // faster.
8374 EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8376 // Generate the iteration count check for the main loop, *after* the check
8377 // for the epilogue loop, so that the path-length is shorter for the case
8378 // that goes directly through the vector epilogue. The longer-path length for
8379 // the main loop is compensated for, by the gain from vectorizing the larger
8380 // trip count. Note: the branch will get updated later on when we vectorize
8381 // the epilogue.
8382 EPI.MainLoopIterationCountCheck =
8383 emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8385 // Generate the induction variable.
8386 OldInduction = Legal->getPrimaryInduction();
8387 Type *IdxTy = Legal->getWidestInductionType();
8388 Value *StartIdx = ConstantInt::get(IdxTy, 0);
8389 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8390 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8391 EPI.VectorTripCount = CountRoundDown;
8392 Induction =
8393 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8394 getDebugLocFromInstOrOperands(OldInduction));
8396 // Skip creating induction resume values here because they will be created in
8397 // the second pass. If we created them here, they wouldn't be used anyway,
8398 // because the vplan in the second pass still contains the inductions from the
8399 // original loop.
8401 return completeLoopSkeleton(Lp, OrigLoopID);
8404 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8405 LLVM_DEBUG({
8406 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8407 << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8408 << ", Main Loop UF:" << EPI.MainLoopUF
8409 << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8410 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8414 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8415 DEBUG_WITH_TYPE(VerboseDebug, {
8416 dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
8420 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8421 Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8422 assert(L && "Expected valid Loop.");
8423 assert(Bypass && "Expected valid bypass basic block.");
8424 unsigned VFactor =
8425 ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
8426 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8427 Value *Count = getOrCreateTripCount(L);
8428 // Reuse existing vector loop preheader for TC checks.
8429 // Note that new preheader block is generated for vector loop.
8430 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8431 IRBuilder<> Builder(TCCheckBlock->getTerminator());
8433 // Generate code to check if the loop's trip count is less than VF * UF of the
8434 // main vector loop.
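// If a scalar epilogue is required, at least one iteration must be left for
// it, so bypass the vector loop even when the trip count equals VF * UF
// (hence ULE rather than ULT). A hypothetical shape of the emitted check,
// assuming VF=4 and UF=2:
//   %min.iters.check = icmp ule i64 %trip.count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph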
8435 auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8436 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8438 Value *CheckMinIters = Builder.CreateICmp(
8439 P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
8440 "min.iters.check");
8442 if (!ForEpilogue)
8443 TCCheckBlock->setName("vector.main.loop.iter.check");
8445 // Create new preheader for vector loop.
8446 LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8447 DT, LI, nullptr, "vector.ph");
8449 if (ForEpilogue) {
8450 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8451 DT->getNode(Bypass)->getIDom()) &&
8452 "TC check is expected to dominate Bypass");
8454 // Update dominator for Bypass & LoopExit.
8455 DT->changeImmediateDominator(Bypass, TCCheckBlock);
8456 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8457 // For loops with multiple exits, there's no edge from the middle block
8458 // to exit blocks (as the epilogue must run) and thus no need to update
8459 // the immediate dominator of the exit blocks.
8460 DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8462 LoopBypassBlocks.push_back(TCCheckBlock);
8464 // Save the trip count so we don't have to regenerate it in the
8465 // vec.epilog.iter.check. This is safe to do because the trip count
8466 // generated here dominates the vector epilog iter check.
8467 EPI.TripCount = Count;
8470 ReplaceInstWithInst(
8471 TCCheckBlock->getTerminator(),
8472 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8474 return TCCheckBlock;
8477 //===--------------------------------------------------------------------===//
8478 // EpilogueVectorizerEpilogueLoop
8479 //===--------------------------------------------------------------------===//
8481 /// This function is partially responsible for generating the control flow
8482 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8483 BasicBlock *
8484 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8485 MDNode *OrigLoopID = OrigLoop->getLoopID();
8486 Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8488 // Now, compare the remaining count and if there aren't enough iterations to
8489 // execute the vectorized epilogue, skip to the scalar part.
8490 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8491 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8492 LoopVectorPreHeader =
8493 SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8494 LI, nullptr, "vec.epilog.ph");
8495 emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8496 VecEpilogueIterationCountCheck);
8498 // Adjust the control flow taking the state info from the main loop
8499 // vectorization into account.
8500 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8501 "expected this to be saved from the previous pass.");
8502 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8503 VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8505 DT->changeImmediateDominator(LoopVectorPreHeader,
8506 EPI.MainLoopIterationCountCheck);
8508 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8509 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8511 if (EPI.SCEVSafetyCheck)
8512 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8513 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8514 if (EPI.MemSafetyCheck)
8515 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8516 VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8518 DT->changeImmediateDominator(
8519 VecEpilogueIterationCountCheck,
8520 VecEpilogueIterationCountCheck->getSinglePredecessor());
8522 DT->changeImmediateDominator(LoopScalarPreHeader,
8523 EPI.EpilogueIterationCountCheck);
8524 if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8525 // If there is an epilogue which must run, there's no edge from the
8526 // middle block to exit blocks and thus no need to update the immediate
8527 // dominator of the exit blocks.
8528 DT->changeImmediateDominator(LoopExitBlock,
8529 EPI.EpilogueIterationCountCheck);
8531 // Keep track of bypass blocks, as they feed start values to the induction
8532 // phis in the scalar loop preheader.
8533 if (EPI.SCEVSafetyCheck)
8534 LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8535 if (EPI.MemSafetyCheck)
8536 LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8537 LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8539 // Generate a resume induction for the vector epilogue and put it in the
8540 // vector epilogue preheader.
8541 Type *IdxTy = Legal->getWidestInductionType();
8542 PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8543 LoopVectorPreHeader->getFirstNonPHI());
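// The resume value is the main loop's vector trip count when control reaches
// the epilogue from the main vector loop, and zero when the main vector loop
// was bypassed entirely by its iteration-count check.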
8544 EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8545 EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8546 EPI.MainLoopIterationCountCheck);
8548 // Generate the induction variable.
8549 OldInduction = Legal->getPrimaryInduction();
8550 Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8551 Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8552 Value *StartIdx = EPResumeVal;
8553 Induction =
8554 createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8555 getDebugLocFromInstOrOperands(OldInduction));
8557 // Generate induction resume values. These variables save the new starting
8558 // indexes for the scalar loop. They are used to test if there are any tail
8559 // iterations left once the vector loop has completed.
8560 // Note that when the vectorized epilogue is skipped due to the iteration count
8561 // check, the resume value for the induction variable comes from
8562 // the trip count of the main vector loop, hence passing the AdditionalBypass
8563 // argument.
8564 createInductionResumeValues(Lp, CountRoundDown,
8565 {VecEpilogueIterationCountCheck,
8566 EPI.VectorTripCount} /* AdditionalBypass */);
8568 AddRuntimeUnrollDisableMetaData(Lp);
8569 return completeLoopSkeleton(Lp, OrigLoopID);
8572 BasicBlock *
8573 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8574 Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8576 assert(EPI.TripCount &&
8577 "Expected trip count to have been saved in the first pass.");
8578 assert(
8579 (!isa<Instruction>(EPI.TripCount) ||
8580 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8581 "saved trip count does not dominate insertion point.");
8582 Value *TC = EPI.TripCount;
8583 IRBuilder<> Builder(Insert->getTerminator());
8584 Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8586 // Generate code to check if the loop's trip count is less than VF * UF of the
8587 // vector epilogue loop.
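// As for the main loop check above: use ULE when a scalar epilogue must still
// run after the vector epilogue, ULT otherwise.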
8588 auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8589 ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8591 Value *CheckMinIters = Builder.CreateICmp(
8592 P, Count,
8593 ConstantInt::get(Count->getType(),
8594 EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8595 "min.epilog.iters.check");
8597 ReplaceInstWithInst(
8598 Insert->getTerminator(),
8599 BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8601 LoopBypassBlocks.push_back(Insert);
8602 return Insert;
8605 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8606 LLVM_DEBUG({
8607 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8608 << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8609 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8613 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8614 DEBUG_WITH_TYPE(VerboseDebug, {
8615 dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8619 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8620 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8621 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8622 bool PredicateAtRangeStart = Predicate(Range.Start);
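// Walk the remaining VFs in the range and shrink Range.End to the first VF
// whose answer differs from the answer at Range.Start, so that every VF left
// in the range shares the same decision. For example, for Range = [4, 32) and
// a predicate that flips at VF=16, the range is clamped to [4, 16).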
8624 for (ElementCount TmpVF = Range.Start * 2;
8625 ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8626 if (Predicate(TmpVF) != PredicateAtRangeStart) {
8627 Range.End = TmpVF;
8628 break;
8631 return PredicateAtRangeStart;
8634 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8635 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8636 /// of VF's starting at a given VF and extending it as much as possible. Each
8637 /// vectorization decision can potentially shorten this sub-range during
8638 /// buildVPlan().
8639 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8640 ElementCount MaxVF) {
8641 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8642 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8643 VFRange SubRange = {VF, MaxVFPlusOne};
8644 VPlans.push_back(buildVPlan(SubRange));
8645 VF = SubRange.End;
8649 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8650 VPlanPtr &Plan) {
8651 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8653 // Look for cached value.
8654 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8655 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8656 if (ECEntryIt != EdgeMaskCache.end())
8657 return ECEntryIt->second;
8659 VPValue *SrcMask = createBlockInMask(Src, Plan);
8661 // The terminator has to be a branch inst!
8662 BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8663 assert(BI && "Unexpected terminator found");
8665 if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8666 return EdgeMaskCache[Edge] = SrcMask;
8668 // If source is an exiting block, we know the exit edge is dynamically dead
8669 // in the vector loop, and thus we don't need to restrict the mask. Avoid
8670 // adding uses of an otherwise potentially dead instruction.
8671 if (OrigLoop->isLoopExiting(Src))
8672 return EdgeMaskCache[Edge] = SrcMask;
8674 VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8675 assert(EdgeMask && "No Edge Mask found for condition");
8677 if (BI->getSuccessor(0) != Dst)
8678 EdgeMask = Builder.createNot(EdgeMask);
8680 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8681 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8682 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8683 // The select version does not introduce new UB if SrcMask is false and
8684 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8685 VPValue *False = Plan->getOrAddVPValue(
8686 ConstantInt::getFalse(BI->getCondition()->getType()));
8687 EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8690 return EdgeMaskCache[Edge] = EdgeMask;
8693 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8694 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8696 // Look for cached value.
8697 BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8698 if (BCEntryIt != BlockMaskCache.end())
8699 return BCEntryIt->second;
8701 // All-one mask is modelled as no-mask following the convention for masked
8702 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8703 VPValue *BlockMask = nullptr;
8705 if (OrigLoop->getHeader() == BB) {
8706 if (!CM.blockNeedsPredication(BB))
8707 return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8709 // Create the block in mask as the first non-phi instruction in the block.
8710 VPBuilder::InsertPointGuard Guard(Builder);
8711 auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8712 Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8714 // Introduce the early-exit compare IV <= BTC to form header block mask.
8715 // This is used instead of IV < TC because TC may wrap, unlike BTC.
8716 // Start by constructing the desired canonical IV.
8717 VPValue *IV = nullptr;
8718 if (Legal->getPrimaryInduction())
8719 IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8720 else {
8721 auto IVRecipe = new VPWidenCanonicalIVRecipe();
8722 Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8723 IV = IVRecipe->getVPSingleValue();
8725 VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8726 bool TailFolded = !CM.isScalarEpilogueAllowed();
8728 if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8729 // While ActiveLaneMask is a binary op that consumes the loop tripcount
8730 // as a second argument, we only pass the IV here and extract the
8731 // tripcount from the transform state where codegen of the VP instructions
8732 // happens.
8733 BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8734 } else {
8735 BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8737 return BlockMaskCache[BB] = BlockMask;
8740 // This is the block mask. We OR all incoming edges.
8741 for (auto *Predecessor : predecessors(BB)) {
8742 VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8743 if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8744 return BlockMaskCache[BB] = EdgeMask;
8746 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8747 BlockMask = EdgeMask;
8748 continue;
8751 BlockMask = Builder.createOr(BlockMask, EdgeMask);
8754 return BlockMaskCache[BB] = BlockMask;
8757 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8758 ArrayRef<VPValue *> Operands,
8759 VFRange &Range,
8760 VPlanPtr &Plan) {
8761 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8762 "Must be called with either a load or store");
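// Per-VF query used to clamp the range: the memory instruction is widened
// unless the VF is scalar or the cost model decided to scalarize it;
// interleaving counts as widening here because the interleave-group recipe is
// introduced later.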
8764 auto willWiden = [&](ElementCount VF) -> bool {
8765 if (VF.isScalar())
8766 return false;
8767 LoopVectorizationCostModel::InstWidening Decision =
8768 CM.getWideningDecision(I, VF);
8769 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8770 "CM decision should be taken at this point.");
8771 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8772 return true;
8773 if (CM.isScalarAfterVectorization(I, VF) ||
8774 CM.isProfitableToScalarize(I, VF))
8775 return false;
8776 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8779 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8780 return nullptr;
8782 VPValue *Mask = nullptr;
8783 if (Legal->isMaskRequired(I))
8784 Mask = createBlockInMask(I->getParent(), Plan);
8786 if (LoadInst *Load = dyn_cast<LoadInst>(I))
8787 return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
8789 StoreInst *Store = cast<StoreInst>(I);
8790 return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8791 Mask);
8794 VPWidenIntOrFpInductionRecipe *
8795 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8796 ArrayRef<VPValue *> Operands) const {
8797 // Check if this is an integer or fp induction. If so, build the recipe that
8798 // produces its scalar and vector values.
8799 InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8800 if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8801 II.getKind() == InductionDescriptor::IK_FpInduction) {
8802 assert(II.getStartValue() ==
8803 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8804 const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8805 return new VPWidenIntOrFpInductionRecipe(
8806 Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
8809 return nullptr;
8812 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8813 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8814 VPlan &Plan) const {
8815 // Optimize the special case where the source is a constant integer
8816 // induction variable. Notice that we can only optimize the 'trunc' case
8817 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8818 // (c) other casts depend on pointer size.
8820 // Determine whether \p K is a truncation based on an induction variable that
8821 // can be optimized.
8822 auto isOptimizableIVTruncate =
8823 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8824 return [=](ElementCount VF) -> bool {
8825 return CM.isOptimizableIVTruncate(K, VF);
8829 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8830 isOptimizableIVTruncate(I), Range)) {
8832 InductionDescriptor II =
8833 Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8834 VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8835 return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8836 Start, nullptr, I);
8838 return nullptr;
8841 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8842 ArrayRef<VPValue *> Operands,
8843 VPlanPtr &Plan) {
8844 // If all incoming values are equal, the incoming VPValue can be used directly
8845 // instead of creating a new VPBlendRecipe.
8846 VPValue *FirstIncoming = Operands[0];
8847 if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8848 return FirstIncoming == Inc;
8849 })) {
8850 return Operands[0];
8853 // We know that all PHIs in non-header blocks are converted into selects, so
8854 // we don't have to worry about the insertion order and we can just use the
8855 // builder. At this point we generate the predication tree. There may be
8856 // duplications since this is a simple recursive scan, but future
8857 // optimizations will clean it up.
8858 SmallVector<VPValue *, 2> OperandsWithMask;
8859 unsigned NumIncoming = Phi->getNumIncomingValues();
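// Build the blend operands as (incoming value, edge mask) pairs, e.g.
// (v0, m0, v1, m1) for a two-way blend; a null edge mask means the edge is
// unmasked and, per the assert below, is only expected for a single incoming
// value, in which case the mask operand is simply omitted.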
8861 for (unsigned In = 0; In < NumIncoming; In++) {
8862 VPValue *EdgeMask =
8863 createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8864 assert((EdgeMask || NumIncoming == 1) &&
8865 "Multiple predecessors with one having a full mask");
8866 OperandsWithMask.push_back(Operands[In]);
8867 if (EdgeMask)
8868 OperandsWithMask.push_back(EdgeMask);
8870 return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8873 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8874 ArrayRef<VPValue *> Operands,
8875 VFRange &Range) const {
8877 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8878 [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8879 Range);
8881 if (IsPredicated)
8882 return nullptr;
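// A few intrinsics (assumes, lifetime markers, side-effect/pseudo-probe
// markers, noalias scope declarations) are never widened into vector calls;
// returning nullptr here lets the generic replication path deal with them.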
8884 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8885 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8886 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8887 ID == Intrinsic::pseudoprobe ||
8888 ID == Intrinsic::experimental_noalias_scope_decl))
8889 return nullptr;
8891 auto willWiden = [&](ElementCount VF) -> bool {
8892 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8893 // The following case may be scalarized depending on the VF.
8894 // The flag shows whether we use an intrinsic or a plain call for the
8895 // vectorized version of the instruction.
8896 // Is it beneficial to perform the intrinsic call compared to the lib call?
8897 bool NeedToScalarize = false;
8898 InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8899 InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8900 bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8901 return UseVectorIntrinsic || !NeedToScalarize;
8904 if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8905 return nullptr;
8907 ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
8908 return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8911 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8912 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8913 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8914 // Instruction should be widened, unless it is scalar after vectorization,
8915 // scalarization is profitable or it is predicated.
8916 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8917 return CM.isScalarAfterVectorization(I, VF) ||
8918 CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8920 return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8921 Range);
8924 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8925 ArrayRef<VPValue *> Operands) const {
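// Only simple single-result opcodes (arithmetic, logical, casts, compares,
// selects) are widened by the generic VPWidenRecipe; anything else should
// have been matched by a more specific recipe earlier or will be replicated.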
8926 auto IsVectorizableOpcode = [](unsigned Opcode) {
8927 switch (Opcode) {
8928 case Instruction::Add:
8929 case Instruction::And:
8930 case Instruction::AShr:
8931 case Instruction::BitCast:
8932 case Instruction::FAdd:
8933 case Instruction::FCmp:
8934 case Instruction::FDiv:
8935 case Instruction::FMul:
8936 case Instruction::FNeg:
8937 case Instruction::FPExt:
8938 case Instruction::FPToSI:
8939 case Instruction::FPToUI:
8940 case Instruction::FPTrunc:
8941 case Instruction::FRem:
8942 case Instruction::FSub:
8943 case Instruction::ICmp:
8944 case Instruction::IntToPtr:
8945 case Instruction::LShr:
8946 case Instruction::Mul:
8947 case Instruction::Or:
8948 case Instruction::PtrToInt:
8949 case Instruction::SDiv:
8950 case Instruction::Select:
8951 case Instruction::SExt:
8952 case Instruction::Shl:
8953 case Instruction::SIToFP:
8954 case Instruction::SRem:
8955 case Instruction::Sub:
8956 case Instruction::Trunc:
8957 case Instruction::UDiv:
8958 case Instruction::UIToFP:
8959 case Instruction::URem:
8960 case Instruction::Xor:
8961 case Instruction::ZExt:
8962 return true;
8964 return false;
8967 if (!IsVectorizableOpcode(I->getOpcode()))
8968 return nullptr;
8970 // Success: widen this instruction.
8971 return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8974 void VPRecipeBuilder::fixHeaderPhis() {
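// Add the backedge operand to each header phi recorded in PhisToFix, now that
// the recipe feeding the phi from the loop latch is guaranteed to exist.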
8975 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8976 for (VPWidenPHIRecipe *R : PhisToFix) {
8977 auto *PN = cast<PHINode>(R->getUnderlyingValue());
8978 VPRecipeBase *IncR =
8979 getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8980 R->addOperand(IncR->getVPSingleValue());
8984 VPBasicBlock *VPRecipeBuilder::handleReplication(
8985 Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8986 VPlanPtr &Plan) {
8987 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8988 [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8989 Range);
8991 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8992 [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
8994 // Even if the instruction is not marked as uniform, there are certain
8995 // intrinsic calls that can be effectively treated as such, so we check for
8996 // them here. Conservatively, we only do this for scalable vectors, since
8997 // for fixed-width VFs we can always fall back on full scalarization.
8998 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8999 switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
9000 case Intrinsic::assume:
9001 case Intrinsic::lifetime_start:
9002 case Intrinsic::lifetime_end:
9003 // For scalable vectors if one of the operands is variant then we still
9004 // want to mark as uniform, which will generate one instruction for just
9005 // the first lane of the vector. We can't scalarize the call in the same
9006 // way as for fixed-width vectors because we don't know how many lanes
9007 // there are.
9009 // The reasons for doing it this way for scalable vectors are:
9010 // 1. For the assume intrinsic generating the instruction for the first
9011 // lane is still better than not generating any at all. For
9012 // example, the input may be a splat across all lanes.
9013 // 2. For the lifetime start/end intrinsics the pointer operand only
9014 // does anything useful when the input comes from a stack object,
9015 // which suggests it should always be uniform. For non-stack objects
9016 // the effect is to poison the object, which still allows us to
9017 // remove the call.
9018 IsUniform = true;
9019 break;
9020 default:
9021 break;
9025 auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
9026 IsUniform, IsPredicated);
9027 setRecipe(I, Recipe);
9028 Plan->addVPValue(I, Recipe);
9030 // Find if I uses a predicated instruction. If so, it will use its scalar
9031 // value. Avoid hoisting the insert-element which packs the scalar value into
9032 // a vector value, as that happens iff all users use the vector value.
9033 for (VPValue *Op : Recipe->operands()) {
9034 auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
9035 if (!PredR)
9036 continue;
9037 auto *RepR =
9038 cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
9039 assert(RepR->isPredicated() &&
9040 "expected Replicate recipe to be predicated");
9041 RepR->setAlsoPack(false);
9044 // Finalize the recipe for Instr, first if it is not predicated.
9045 if (!IsPredicated) {
9046 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
9047 VPBB->appendRecipe(Recipe);
9048 return VPBB;
9050 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
9051 assert(VPBB->getSuccessors().empty() &&
9052 "VPBB has successors when handling predicated replication.");
9053 // Record predicated instructions for above packing optimizations.
9054 VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
9055 VPBlockUtils::insertBlockAfter(Region, VPBB);
9056 auto *RegSucc = new VPBasicBlock();
9057 VPBlockUtils::insertBlockAfter(RegSucc, Region);
9058 return RegSucc;
9061 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
9062 VPRecipeBase *PredRecipe,
9063 VPlanPtr &Plan) {
9064 // Instructions marked for predication are replicated and placed under an
9065 // if-then construct to prevent side-effects.
9067 // Generate recipes to compute the block mask for this region.
9068 VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
9070 // Build the triangular if-then region.
9071 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
9072 assert(Instr->getParent() && "Predicated instruction not in any basic block");
9073 auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
9074 auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
9075 auto *PHIRecipe = Instr->getType()->isVoidTy()
9076 ? nullptr
9077 : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
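// If the replicated instruction defines a value, users outside the region
// must see the merged value produced by the PHI recipe, so re-map the
// instruction from the replicate recipe to the PHI recipe.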
9078 if (PHIRecipe) {
9079 Plan->removeVPValueFor(Instr);
9080 Plan->addVPValue(Instr, PHIRecipe);
9082 auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
9083 auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
9084 VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
9086 // Note: first set Entry as region entry and then connect successors starting
9087 // from it in order, to propagate the "parent" of each VPBasicBlock.
9088 VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
9089 VPBlockUtils::connectBlocks(Pred, Exit);
9091 return Region;
9094 VPRecipeOrVPValueTy
9095 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
9096 ArrayRef<VPValue *> Operands,
9097 VFRange &Range, VPlanPtr &Plan) {
9098 // First, check for specific widening recipes that deal with calls, memory
9099 // operations, inductions and Phi nodes.
9100 if (auto *CI = dyn_cast<CallInst>(Instr))
9101 return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
9103 if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
9104 return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
9106 VPRecipeBase *Recipe;
9107 if (auto Phi = dyn_cast<PHINode>(Instr)) {
9108 if (Phi->getParent() != OrigLoop->getHeader())
9109 return tryToBlend(Phi, Operands, Plan);
9110 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
9111 return toVPRecipeResult(Recipe);
9113 VPWidenPHIRecipe *PhiRecipe = nullptr;
9114 if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
9115 VPValue *StartV = Operands[0];
9116 if (Legal->isReductionVariable(Phi)) {
9117 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9118 assert(RdxDesc.getRecurrenceStartValue() ==
9119 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
9120 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
9121 CM.isInLoopReduction(Phi),
9122 CM.useOrderedReductions(RdxDesc));
9123 } else {
9124 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
9127 // Record the incoming value from the backedge, so we can add the incoming
9128 // value from the backedge after all recipes have been created.
9129 recordRecipeOf(cast<Instruction>(
9130 Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
9131 PhisToFix.push_back(PhiRecipe);
9132 } else {
9133 // TODO: record start and backedge value for remaining pointer induction
9134 // phis.
9135 assert(Phi->getType()->isPointerTy() &&
9136 "only pointer phis should be handled here");
9137 PhiRecipe = new VPWidenPHIRecipe(Phi);
9140 return toVPRecipeResult(PhiRecipe);
9143 if (isa<TruncInst>(Instr) &&
9144 (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
9145 Range, *Plan)))
9146 return toVPRecipeResult(Recipe);
9148 if (!shouldWiden(Instr, Range))
9149 return nullptr;
9151 if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
9152 return toVPRecipeResult(new VPWidenGEPRecipe(
9153 GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
9155 if (auto *SI = dyn_cast<SelectInst>(Instr)) {
9156 bool InvariantCond =
9157 PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
9158 return toVPRecipeResult(new VPWidenSelectRecipe(
9159 *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
9162 return toVPRecipeResult(tryToWiden(Instr, Operands));
9165 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
9166 ElementCount MaxVF) {
9167 assert(OrigLoop->isInnermost() && "Inner loop expected.");
9169 // Collect instructions from the original loop that will become trivially dead
9170 // in the vectorized loop. We don't need to vectorize these instructions. For
9171 // example, original induction update instructions can become dead because we
9172 // separately emit induction "steps" when generating code for the new loop.
9173 // Similarly, we create a new latch condition when setting up the structure
9174 // of the new loop, so the old one can become dead.
9175 SmallPtrSet<Instruction *, 4> DeadInstructions;
9176 collectTriviallyDeadInstructions(DeadInstructions);
9178 // Add assume instructions we need to drop to DeadInstructions, to prevent
9179 // them from being added to the VPlan.
9180 // TODO: We only need to drop assumes in blocks that get flattened. If the
9181 // control flow is preserved, we should keep them.
9182 auto &ConditionalAssumes = Legal->getConditionalAssumes();
9183 DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
9185 MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
9186 // Dead instructions do not need sinking. Remove them from SinkAfter.
9187 for (Instruction *I : DeadInstructions)
9188 SinkAfter.erase(I);
9190 // Cannot sink instructions after dead instructions (there won't be any
9191 // recipes for them). Instead, find the first non-dead previous instruction.
9192 for (auto &P : Legal->getSinkAfter()) {
9193 Instruction *SinkTarget = P.second;
9194 Instruction *FirstInst = &*SinkTarget->getParent()->begin();
9195 (void)FirstInst;
9196 while (DeadInstructions.contains(SinkTarget)) {
9197 assert(
9198 SinkTarget != FirstInst &&
9199 "Must find a live instruction (at least the one feeding the "
9200 "first-order recurrence PHI) before reaching beginning of the block");
9201 SinkTarget = SinkTarget->getPrevNode();
9202 assert(SinkTarget != P.first &&
9203 "sink source equals target, no sinking required");
9205 P.second = SinkTarget;
9208 auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9209 for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9210 VFRange SubRange = {VF, MaxVFPlusOne};
9211 VPlans.push_back(
9212 buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9213 VF = SubRange.End;
9217 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9218 VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9219 const MapVector<Instruction *, Instruction *> &SinkAfter) {
9221 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9223 VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9225 // ---------------------------------------------------------------------------
9226 // Pre-construction: record ingredients whose recipes we'll need to further
9227 // process after constructing the initial VPlan.
9228 // ---------------------------------------------------------------------------
9230 // Mark instructions we'll need to sink later and their targets as
9231 // ingredients whose recipe we'll need to record.
9232 for (auto &Entry : SinkAfter) {
9233 RecipeBuilder.recordRecipeOf(Entry.first);
9234 RecipeBuilder.recordRecipeOf(Entry.second);
9236 for (auto &Reduction : CM.getInLoopReductionChains()) {
9237 PHINode *Phi = Reduction.first;
9238 RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9239 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9241 RecipeBuilder.recordRecipeOf(Phi);
9242 for (auto &R : ReductionOperations) {
9243 RecipeBuilder.recordRecipeOf(R);
9244 // For min/max reductions, where we have a pair of icmp/select, we also
9245 // need to record the ICmp recipe, so it can be removed later.
9246 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9247 RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9251 // For each interleave group which is relevant for this (possibly trimmed)
9252 // Range, add it to the set of groups to be later applied to the VPlan and add
9253 // placeholders for its members' Recipes which we'll be replacing with a
9254 // single VPInterleaveRecipe.
9255 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9256 auto applyIG = [IG, this](ElementCount VF) -> bool {
9257 return (VF.isVector() && // Query is illegal for VF == 1
9258 CM.getWideningDecision(IG->getInsertPos(), VF) ==
9259 LoopVectorizationCostModel::CM_Interleave);
9261 if (!getDecisionAndClampRange(applyIG, Range))
9262 continue;
9263 InterleaveGroups.insert(IG);
9264 for (unsigned i = 0; i < IG->getFactor(); i++)
9265 if (Instruction *Member = IG->getMember(i))
9266 RecipeBuilder.recordRecipeOf(Member);
9269 // ---------------------------------------------------------------------------
9270 // Build initial VPlan: Scan the body of the loop in a topological order to
9271 // visit each basic block after having visited its predecessor basic blocks.
9272 // ---------------------------------------------------------------------------
9274 // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9275 auto Plan = std::make_unique<VPlan>();
9276 VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9277 Plan->setEntry(VPBB);
9279 // Scan the body of the loop in a topological order to visit each basic block
9280 // after having visited its predecessor basic blocks.
9281 LoopBlocksDFS DFS(OrigLoop);
9282 DFS.perform(LI);
9284 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9285 // Relevant instructions from basic block BB will be grouped into VPRecipe
9286 // ingredients and fill a new VPBasicBlock.
9287 unsigned VPBBsForBB = 0;
9288 auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9289 VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9290 VPBB = FirstVPBBForBB;
9291 Builder.setInsertPoint(VPBB);
9293 // Introduce each ingredient into VPlan.
9294 // TODO: Model and preserve debug intrinsics in VPlan.
9295 for (Instruction &I : BB->instructionsWithoutDebug()) {
9296 Instruction *Instr = &I;
9298 // First filter out irrelevant instructions, to ensure no recipes are
9299 // built for them.
9300 if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9301 continue;
9303 SmallVector<VPValue *, 4> Operands;
9304 auto *Phi = dyn_cast<PHINode>(Instr);
9305 if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9306 Operands.push_back(Plan->getOrAddVPValue(
9307 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9308 } else {
9309 auto OpRange = Plan->mapToVPValues(Instr->operands());
9310 Operands = {OpRange.begin(), OpRange.end()};
9312 if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9313 Instr, Operands, Range, Plan)) {
9314 // If Instr can be simplified to an existing VPValue, use it.
9315 if (RecipeOrValue.is<VPValue *>()) {
9316 auto *VPV = RecipeOrValue.get<VPValue *>();
9317 Plan->addVPValue(Instr, VPV);
9318 // If the re-used value is a recipe, register the recipe for the
9319 // instruction, in case the recipe for Instr needs to be recorded.
9320 if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9321 RecipeBuilder.setRecipe(Instr, R);
9322 continue;
9324 // Otherwise, add the new recipe.
9325 VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9326 for (auto *Def : Recipe->definedValues()) {
9327 auto *UV = Def->getUnderlyingValue();
9328 Plan->addVPValue(UV, Def);
9331 RecipeBuilder.setRecipe(Instr, Recipe);
9332 VPBB->appendRecipe(Recipe);
9333 continue;
9336 // Otherwise, if all widening options failed, the instruction is to be
9337 // replicated. This may create a successor for VPBB.
9338 VPBasicBlock *NextVPBB =
9339 RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9340 if (NextVPBB != VPBB) {
9341 VPBB = NextVPBB;
9342 VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9343 : "");
9348 RecipeBuilder.fixHeaderPhis();
9350 // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
9351 // may also be empty, such as the last one (VPBB), reflecting original
9352 // basic-blocks with no recipes.
9353 VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
9354 assert(PreEntry->empty() && "Expecting empty pre-entry block.");
9355 VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
9356 VPBlockUtils::disconnectBlocks(PreEntry, Entry);
9357 delete PreEntry;
9359 // ---------------------------------------------------------------------------
9360 // Transform initial VPlan: Apply previously taken decisions, in order, to
9361 // bring the VPlan to its final state.
9362 // ---------------------------------------------------------------------------
9364 // Apply Sink-After legal constraints.
9365 auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9366 auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9367 if (Region && Region->isReplicator()) {
9368 assert(Region->getNumSuccessors() == 1 &&
9369 Region->getNumPredecessors() == 1 && "Expected SESE region!");
9370 assert(R->getParent()->size() == 1 &&
9371 "A recipe in an original replicator region must be the only "
9372 "recipe in its block");
9373 return Region;
9375 return nullptr;
9377 for (auto &Entry : SinkAfter) {
9378 VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9379 VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9381 auto *TargetRegion = GetReplicateRegion(Target);
9382 auto *SinkRegion = GetReplicateRegion(Sink);
9383 if (!SinkRegion) {
9384 // If the sink source is not a replicate region, sink the recipe directly.
9385 if (TargetRegion) {
9386 // The target is in a replication region, make sure to move Sink to
9387 // the block after it, not into the replication region itself.
9388 VPBasicBlock *NextBlock =
9389 cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9390 Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9391 } else
9392 Sink->moveAfter(Target);
9393 continue;
9396 // The sink source is in a replicate region. Unhook the region from the CFG.
9397 auto *SinkPred = SinkRegion->getSinglePredecessor();
9398 auto *SinkSucc = SinkRegion->getSingleSuccessor();
9399 VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9400 VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9401 VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9403 if (TargetRegion) {
9404 // The target recipe is also in a replicate region, move the sink region
9405 // after the target region.
9406 auto *TargetSucc = TargetRegion->getSingleSuccessor();
9407 VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9408 VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9409 VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9410 } else {
9411 // The sink source is in a replicate region, so we need to move the whole
9412 // replicate region, which should only contain a single recipe in the
9413 // main block.
9414 auto *SplitBlock =
9415 Target->getParent()->splitAt(std::next(Target->getIterator()));
9417 auto *SplitPred = SplitBlock->getSinglePredecessor();
9419 VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9420 VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9421 VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9422 if (VPBB == SplitPred)
9423 VPBB = SplitBlock;
9427 // Introduce a recipe to combine the incoming and previous values of a
9428 // first-order recurrence.
9429 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9430 auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9431 if (!RecurPhi)
9432 continue;
9434 auto *RecurSplice = cast<VPInstruction>(
9435 Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9436 {RecurPhi, RecurPhi->getBackedgeValue()}));
9438 VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9439 if (auto *Region = GetReplicateRegion(PrevRecipe)) {
9440 VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor());
9441 RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi());
9442 } else
9443 RecurSplice->moveAfter(PrevRecipe);
9444 RecurPhi->replaceAllUsesWith(RecurSplice);
9445 // Set the first operand of RecurSplice to RecurPhi again, after replacing
9446 // all users.
9447 RecurSplice->setOperand(0, RecurPhi);
9450 // Interleave memory: for each Interleave Group we marked earlier as relevant
9451 // for this VPlan, replace the Recipes widening its memory instructions with a
9452 // single VPInterleaveRecipe at its insertion point.
9453 for (auto IG : InterleaveGroups) {
9454 auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9455 RecipeBuilder.getRecipe(IG->getInsertPos()));
9456 SmallVector<VPValue *, 4> StoredValues;
9457 for (unsigned i = 0; i < IG->getFactor(); ++i)
9458 if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9459 auto *StoreR =
9460 cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9461 StoredValues.push_back(StoreR->getStoredValue());
9464 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9465 Recipe->getMask());
9466 VPIG->insertBefore(Recipe);
9467 unsigned J = 0;
9468 for (unsigned i = 0; i < IG->getFactor(); ++i)
9469 if (Instruction *Member = IG->getMember(i)) {
9470 if (!Member->getType()->isVoidTy()) {
9471 VPValue *OriginalV = Plan->getVPValue(Member);
9472 Plan->removeVPValueFor(Member);
9473 Plan->addVPValue(Member, VPIG->getVPValue(J));
9474 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9475 J++;
9477 RecipeBuilder.getRecipe(Member)->eraseFromParent();
9481 // Adjust the recipes for any inloop reductions.
9482 adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
9484 VPlanTransforms::sinkScalarOperands(*Plan);
9485 VPlanTransforms::mergeReplicateRegions(*Plan);
9487 std::string PlanName;
9488 raw_string_ostream RSO(PlanName);
9489 ElementCount VF = Range.Start;
9490 Plan->addVF(VF);
9491 RSO << "Initial VPlan for VF={" << VF;
9492 for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9493 Plan->addVF(VF);
9494 RSO << "," << VF;
9496 RSO << "},UF>=1";
9497 RSO.flush();
9498 Plan->setName(PlanName);
9500 return Plan;
9503 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9504 // Outer loop handling: outer loops may require CFG and instruction level
9505 // transformations before even evaluating whether vectorization is profitable.
9506 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9507 // the vectorization pipeline.
9508 assert(!OrigLoop->isInnermost());
9509 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9511 // Create new empty VPlan
9512 auto Plan = std::make_unique<VPlan>();
9514 // Build hierarchical CFG
9515 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9516 HCFGBuilder.buildHierarchicalCFG();
9518 for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9519 VF *= 2)
9520 Plan->addVF(VF);
9522 if (EnableVPlanPredication) {
9523 VPlanPredicator VPP(*Plan);
9524 VPP.predicate();
9526 // Avoid running the transformation to recipes until masked code generation
9527 // in the VPlan-native path is in place.
9528 return Plan;
9531 SmallPtrSet<Instruction *, 1> DeadInstructions;
9532 VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9533 Legal->getInductionVars(),
9534 DeadInstructions, *PSE.getSE());
9535 return Plan;
9538 // Adjust the recipes for reductions. For in-loop reductions the chain of
9539 // instructions leading from the loop exit instr to the phi needs to be converted
9540 // to reductions, with one operand being vector and the other being the scalar
9541 // reduction chain. For other reductions, a select is introduced between the phi
9542 // and live-out recipes when folding the tail.
9543 void LoopVectorizationPlanner::adjustRecipesForReductions(
9544 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9545 ElementCount MinVF) {
9546 for (auto &Reduction : CM.getInLoopReductionChains()) {
9547 PHINode *Phi = Reduction.first;
9548 RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9549 const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9551 if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9552 continue;
9554 // ReductionOperations are ordered top-down from the phi's use to the
9555 // LoopExitValue. We keep track of the previous item (the Chain) to tell
9556 // which of the two operands will remain scalar and which will be reduced.
9557 // For minmax the chain will be the select instructions.
9558 Instruction *Chain = Phi;
9559 for (Instruction *R : ReductionOperations) {
9560 VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9561 RecurKind Kind = RdxDesc.getRecurrenceKind();
9563 VPValue *ChainOp = Plan->getVPValue(Chain);
9564 unsigned FirstOpId;
9565 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9566 assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9567 "Expected to replace a VPWidenSelectSC");
9568 FirstOpId = 1;
9569 } else {
9570 assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
9571 "Expected to replace a VPWidenSC");
9572 FirstOpId = 0;
9574 unsigned VecOpId =
9575 R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9576 VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
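// When the tail is folded by masking, predicate the reduction with the block
// mask so that lanes disabled by the mask do not contribute to the result.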
9578 auto *CondOp = CM.foldTailByMasking()
9579 ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9580 : nullptr;
9581 VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9582 &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9583 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9584 Plan->removeVPValueFor(R);
9585 Plan->addVPValue(R, RedRecipe);
9586 WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9587 WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9588 WidenRecipe->eraseFromParent();
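// For min/max reductions the compare that fed the replaced select is now
// dead; the reduction recipe performs the comparison itself.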
9590 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9591 VPRecipeBase *CompareRecipe =
9592 RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9593 assert(isa<VPWidenRecipe>(CompareRecipe) &&
9594 "Expected to replace a VPWidenSC");
9595 assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9596 "Expected no remaining users");
9597 CompareRecipe->eraseFromParent();
9599 Chain = R;
9603 // If tail is folded by masking, introduce selects between the phi
9604 // and the live-out instruction of each reduction, at the end of the latch.
9605 if (CM.foldTailByMasking()) {
9606 for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9607 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9608 if (!PhiR || PhiR->isInLoop())
9609 continue;
9610 Builder.setInsertPoint(LatchVPBB);
9611 VPValue *Cond =
9612 RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9613 VPValue *Red = PhiR->getBackedgeValue();
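// Active lanes take the value coming from the reduction; masked-off lanes
// keep the phi value, i.e. the value from the previous iteration.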
9614 Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9619 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9620 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9621 VPSlotTracker &SlotTracker) const {
9622 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9623 IG->getInsertPos()->printAsOperand(O, false);
9624 O << ", ";
9625 getAddr()->printAsOperand(O, SlotTracker);
9626 VPValue *Mask = getMask();
9627 if (Mask) {
9628 O << ", ";
9629 Mask->printAsOperand(O, SlotTracker);
9632 unsigned OpIdx = 0;
9633 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9634 if (!IG->getMember(i))
9635 continue;
9636 if (getNumStoreOperands() > 0) {
9637 O << "\n" << Indent << " store ";
9638 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9639 O << " to index " << i;
9640 } else {
9641 O << "\n" << Indent << " ";
9642 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9643 O << " = load from index " << i;
9645 ++OpIdx;
9648 #endif
9650 void VPWidenCallRecipe::execute(VPTransformState &State) {
9651 State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9652 *this, State);
9655 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9656 State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9657 this, *this, InvariantCond, State);
9660 void VPWidenRecipe::execute(VPTransformState &State) {
9661 State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9664 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9665 State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9666 *this, State.UF, State.VF, IsPtrLoopInvariant,
9667 IsIndexLoopInvariant, State);
9670 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9671 assert(!State.Instance && "Int or FP induction being replicated.");
9672 State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9673 getTruncInst(), getVPValue(0),
9674 getCastValue(), State);
9677 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9678 State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9679 State);
9682 void VPBlendRecipe::execute(VPTransformState &State) {
9683 State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9684 // We know that all PHIs in non-header blocks are converted into
9685 // selects, so we don't have to worry about the insertion order and we
9686 // can just use the builder.
9687 // At this point we generate the predication tree. There may be
9688 // duplications since this is a simple recursive scan, but future
9689 // optimizations will clean it up.
9691 unsigned NumIncoming = getNumIncomingValues();
9693 // Generate a sequence of selects of the form:
9694 // SELECT(Mask3, In3,
9695 // SELECT(Mask2, In2,
9696 // SELECT(Mask1, In1,
9697 // In0)))
9698 // Note that Mask0 is never used: lanes for which no path reaches this phi
9699 // (and which are therefore essentially undef) take their value from In0.
9700 InnerLoopVectorizer::VectorParts Entry(State.UF);
9701 for (unsigned In = 0; In < NumIncoming; ++In) {
9702 for (unsigned Part = 0; Part < State.UF; ++Part) {
9703 // We might have single edge PHIs (blocks) - use an identity
9704 // 'select' for the first PHI operand.
9705 Value *In0 = State.get(getIncomingValue(In), Part);
9706 if (In == 0)
9707 Entry[Part] = In0; // Initialize with the first incoming value.
9708 else {
9709 // Select between the current value and the previous incoming edge
9710 // based on the incoming mask.
9711 Value *Cond = State.get(getMask(In), Part);
9712 Entry[Part] =
9713 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9717 for (unsigned Part = 0; Part < State.UF; ++Part)
9718 State.set(this, Entry[Part], Part);
9721 void VPInterleaveRecipe::execute(VPTransformState &State) {
9722 assert(!State.Instance && "Interleave group being replicated.");
9723 State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9724 getStoredValues(), getMask());
9727 void VPReductionRecipe::execute(VPTransformState &State) {
9728 assert(!State.Instance && "Reduction being replicated.");
9729 Value *PrevInChain = State.get(getChainOp(), 0);
9730 for (unsigned Part = 0; Part < State.UF; ++Part) {
9731 RecurKind Kind = RdxDesc->getRecurrenceKind();
9732 bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9733 Value *NewVecOp = State.get(getVecOp(), Part);
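// If the reduction is predicated, substitute the recurrence identity for
// masked-off lanes so they do not affect the reduced value.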
9734 if (VPValue *Cond = getCondOp()) {
9735 Value *NewCond = State.get(Cond, Part);
9736 VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9737 Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9738 Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9739 Constant *IdenVec =
9740 ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9741 Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9742 NewVecOp = Select;
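// Ordered (strict in-order FP) reductions fold the vector operand directly
// into the scalar chain value; unordered reductions reduce the vector per
// part and combine the result with the chain below.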
9744 Value *NewRed;
9745 Value *NextInChain;
9746 if (IsOrdered) {
9747 if (State.VF.isVector())
9748 NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9749 PrevInChain);
9750 else
9751 NewRed = State.Builder.CreateBinOp(
9752 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
9753 PrevInChain, NewVecOp);
9754 PrevInChain = NewRed;
9755 } else {
9756 PrevInChain = State.get(getChainOp(), Part);
9757 NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9759 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9760 NextInChain =
9761 createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9762 NewRed, PrevInChain);
9763 } else if (IsOrdered)
9764 NextInChain = NewRed;
9765 else {
9766 NextInChain = State.Builder.CreateBinOp(
9767 (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9768 PrevInChain);
9770 State.set(this, NextInChain, Part);
9774 void VPReplicateRecipe::execute(VPTransformState &State) {
9775 if (State.Instance) { // Generate a single instance.
9776 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9777 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9778 *State.Instance, IsPredicated, State);
9779 // Insert the scalar instance, packing it into a vector.
9780 if (AlsoPack && State.VF.isVector()) {
9781 // If we're constructing lane 0, initialize to start from poison.
9782 if (State.Instance->Lane.isFirstLane()) {
9783 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9784 Value *Poison = PoisonValue::get(
9785 VectorType::get(getUnderlyingValue()->getType(), State.VF));
9786 State.set(this, Poison, State.Instance->Part);
9788 State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9790 return;
9793 // Generate scalar instances for all VF lanes of all UF parts, unless the
9794 // instruction is uniform, in which case generate only the first lane for each
9795 // of the UF parts.
9796 unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9797 assert((!State.VF.isScalable() || IsUniform) &&
9798 "Can't scalarize a scalable vector");
9799 for (unsigned Part = 0; Part < State.UF; ++Part)
9800 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9801 State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9802 VPIteration(Part, Lane), IsPredicated,
9803 State);
9806 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9807 assert(State.Instance && "Branch on Mask works only on single instance.");
9809 unsigned Part = State.Instance->Part;
9810 unsigned Lane = State.Instance->Lane.getKnownLane();
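// Compute the branch condition for this lane: extract its bit from the block
// mask if one exists, otherwise the block is executed unconditionally.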
9812 Value *ConditionBit = nullptr;
9813 VPValue *BlockInMask = getMask();
9814 if (BlockInMask) {
9815 ConditionBit = State.get(BlockInMask, Part);
9816 if (ConditionBit->getType()->isVectorTy())
9817 ConditionBit = State.Builder.CreateExtractElement(
9818 ConditionBit, State.Builder.getInt32(Lane));
9819 } else // Block in mask is all-one.
9820 ConditionBit = State.Builder.getTrue();
9822 // Replace the temporary unreachable terminator with a new conditional branch,
9823 // whose two destinations will be set later when they are created.
9824 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9825 assert(isa<UnreachableInst>(CurrentTerminator) &&
9826 "Expected to replace unreachable terminator with conditional branch.");
9827 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9828 CondBr->setSuccessor(0, nullptr);
9829 ReplaceInstWithInst(CurrentTerminator, CondBr);
9832 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9833 assert(State.Instance && "Predicated instruction PHI works per instance.");
9834 Instruction *ScalarPredInst =
9835 cast<Instruction>(State.get(getOperand(0), *State.Instance));
9836 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9837 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9838 assert(PredicatingBB && "Predicated block has no single predecessor.");
9839 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9840 "operand must be VPReplicateRecipe");
9842 // By current pack/unpack logic we need to generate only a single phi node: if
9843 // a vector value for the predicated instruction exists at this point it means
9844 // the instruction has vector users only, and a phi for the vector value is
9845 // needed. In this case the recipe of the predicated instruction is marked to
9846 // also do that packing, thereby "hoisting" the insert-element sequence.
9847 // Otherwise, a phi node for the scalar value is needed.
9848 unsigned Part = State.Instance->Part;
9849 if (State.hasVectorValue(getOperand(0), Part)) {
9850 Value *VectorValue = State.get(getOperand(0), Part);
9851 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9852 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9853 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9854 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9855 if (State.hasVectorValue(this, Part))
9856 State.reset(this, VPhi, Part);
9857 else
9858 State.set(this, VPhi, Part);
9859 // NOTE: Currently we need to update the value of the operand, so the next
9860 // predicated iteration inserts its generated value in the correct vector.
9861 State.reset(getOperand(0), VPhi, Part);
9862 } else {
9863 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9864 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9865 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9866 PredicatingBB);
9867 Phi->addIncoming(ScalarPredInst, PredicatedBB);
9868 if (State.hasScalarValue(this, *State.Instance))
9869 State.reset(this, Phi, *State.Instance);
9870 else
9871 State.set(this, Phi, *State.Instance);
9872 // NOTE: Currently we need to update the value of the operand, so the next
9873 // predicated iteration inserts its generated value in the correct vector.
9874 State.reset(getOperand(0), Phi, *State.Instance);
9878 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9879 VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9880 State.ILV->vectorizeMemoryInstruction(
9881 &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
9882 StoredValue, getMask());
9885 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
9886 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9887 // predication, and 4) a TTI hook that analyzes whether the loop is suitable
9888 // for predication.
9889 static ScalarEpilogueLowering getScalarEpilogueLowering(
9890 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9891 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9892 AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9893 LoopVectorizationLegality &LVL) {
9894 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9895 // don't look at hints or options, and don't request a scalar epilogue.
9896 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9897 // LoopAccessInfo (due to code dependency and not being able to reliably get
9898 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9899 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9900 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9901 // back to the old way and vectorize with versioning when forced. See D81345.)
9902 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9903 PGSOQueryType::IRPass) &&
9904 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9905 return CM_ScalarEpilogueNotAllowedOptSize;
9907 // 2) If set, obey the directives
9908 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9909 switch (PreferPredicateOverEpilogue) {
9910 case PreferPredicateTy::ScalarEpilogue:
9911 return CM_ScalarEpilogueAllowed;
9912 case PreferPredicateTy::PredicateElseScalarEpilogue:
9913 return CM_ScalarEpilogueNotNeededUsePredicate;
9914 case PreferPredicateTy::PredicateOrDontVectorize:
9915 return CM_ScalarEpilogueNotAllowedUsePredicate;
9919 // 3) If set, obey the hints
9920 switch (Hints.getPredicate()) {
9921 case LoopVectorizeHints::FK_Enabled:
9922 return CM_ScalarEpilogueNotNeededUsePredicate;
9923 case LoopVectorizeHints::FK_Disabled:
9924 return CM_ScalarEpilogueAllowed;
9927 // 4) If the TTI hook indicates this is profitable, request predication.
9928 if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9929 LVL.getLAI()))
9930 return CM_ScalarEpilogueNotNeededUsePredicate;
9932 return CM_ScalarEpilogueAllowed;
9935 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9936 // If Values have been set for this Def, return the one relevant for \p Part.
9937 if (hasVectorValue(Def, Part))
9938 return Data.PerPartOutput[Def][Part];
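// Neither a vector value nor a scalar value for lane 0 exists for Def in
// this part, so Def must be a live-in IR value; broadcast it and cache the
// result.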
9940 if (!hasScalarValue(Def, {Part, 0})) {
9941 Value *IRV = Def->getLiveInIRValue();
9942 Value *B = ILV->getBroadcastInstrs(IRV);
9943 set(Def, B, Part);
9944 return B;
9947 Value *ScalarValue = get(Def, {Part, 0});
9948 // If we aren't vectorizing, we can just copy the scalar map values over
9949 // to the vector map.
9950 if (VF.isScalar()) {
9951 set(Def, ScalarValue, Part);
9952 return ScalarValue;
9955 auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9956 bool IsUniform = RepR && RepR->isUniform();
9958 unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9959 // Check if there is a scalar value for the selected lane.
9960 if (!hasScalarValue(Def, {Part, LastLane})) {
9961 // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9962 assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
9963 "unexpected recipe found to be invariant");
9964 IsUniform = true;
9965 LastLane = 0;
9968 auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9969 // Set the insert point after the last scalarized instruction or after the
9970 // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9971 // will directly follow the scalar definitions.
9972 auto OldIP = Builder.saveIP();
9973 auto NewIP =
9974 isa<PHINode>(LastInst)
9975 ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9976 : std::next(BasicBlock::iterator(LastInst));
9977 Builder.SetInsertPoint(&*NewIP);
9979 // However, if we are vectorizing, we need to construct the vector values.
9980 // If the value is known to be uniform after vectorization, we can just
9981 // broadcast the scalar value corresponding to lane zero for each unroll
9982 // iteration. Otherwise, we construct the vector values using
9983 // insertelement instructions. Since the resulting vectors are stored in
9984 // State, we will only generate the insertelements once.
9985 Value *VectorValue = nullptr;
9986 if (IsUniform) {
9987 VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9988 set(Def, VectorValue, Part);
9989 } else {
9990 // Initialize packing with insertelements to start from poison.
9991 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9992 Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9993 set(Def, Poison, Part);
9994 for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9995 ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9996 VectorValue = get(Def, Part);
9998 Builder.restoreIP(OldIP);
9999 return VectorValue;
10002 // Process the loop in the VPlan-native vectorization path. This path builds
10003 // VPlan upfront in the vectorization pipeline, which allows applying
10004 // VPlan-to-VPlan transformations from the very beginning without modifying the
10005 // input LLVM IR.
10006 static bool processLoopInVPlanNativePath(
10007 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10008 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10009 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10010 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10011 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10012 LoopVectorizationRequirements &Requirements) {
10014 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10015 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10016 return false;
10018 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10019 Function *F = L->getHeader()->getParent();
10020 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10022 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10023 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10025 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10026 &Hints, IAI);
10027 // Use the planner for outer loop vectorization.
10028 // TODO: CM is not used at this point inside the planner. Turn CM into an
10029 // optional argument if we don't need it in the future.
10030 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10031 Requirements, ORE);
10033 // Get user vectorization factor.
10034 ElementCount UserVF = Hints.getWidth();
10036 CM.collectElementTypesForWidening();
10038 // Plan how to best vectorize, return the best VF and its cost.
10039 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10041 // If we are stress testing VPlan builds, do not attempt to generate vector
10042 // code. Masked vector code generation support will follow soon.
10043 // Also, do not attempt to vectorize if no vector code will be produced.
10044 if (VPlanBuildStressTest || EnableVPlanPredication ||
10045 VectorizationFactor::Disabled() == VF)
10046 return false;
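// The VPlan-native path currently vectorizes outer loops with an interleave
// count (UF) of 1.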
10048 LVP.setBestPlan(VF.Width, 1);
10051 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10052 F->getParent()->getDataLayout());
10053 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10054 &CM, BFI, PSI, Checks);
10055 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10056 << L->getHeader()->getParent()->getName() << "\"\n");
10057 LVP.executePlan(LB, DT);
10060 // Mark the loop as already vectorized to avoid vectorizing again.
10061 Hints.setAlreadyVectorized();
10062 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10063 return true;
10066 // Emit a remark if there are stores to floats that required a floating point
10067 // extension. If the vectorized loop was generated with double-precision
10068 // floating point there will be a performance penalty from the conversion
10069 // overhead and the change in the vector width.
10070 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10071 SmallVector<Instruction *, 4> Worklist;
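// Seed the worklist with every store of a 'float' value in the loop; their
// operand chains are walked below looking for floating point conversions.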
10072 for (BasicBlock *BB : L->getBlocks()) {
10073 for (Instruction &Inst : *BB) {
10074 if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10075 if (S->getValueOperand()->getType()->isFloatTy())
10076 Worklist.push_back(S);
10081 // Traverse the floating point stores upwards, searching for floating point
10082 // conversions.
10083 SmallPtrSet<const Instruction *, 4> Visited;
10084 SmallPtrSet<const Instruction *, 4> EmittedRemark;
10085 while (!Worklist.empty()) {
10086 auto *I = Worklist.pop_back_val();
10087 if (!L->contains(I))
10088 continue;
10089 if (!Visited.insert(I).second)
10090 continue;
10092 // Emit a remark if the floating point store required a floating
10093 // point conversion.
10094 // TODO: More work could be done to identify the root cause such as a
10095 // constant or a function return type and point the user to it.
10096 if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10097 ORE->emit([&]() {
10098 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10099 I->getDebugLoc(), L->getHeader())
10100 << "floating point conversion changes vector width. "
10101 << "Mixed floating point precision requires an up/down "
10102 << "cast that will negatively impact performance.";
10105 for (Use &Op : I->operands())
10106 if (auto *OpI = dyn_cast<Instruction>(Op))
10107 Worklist.push_back(OpI);
10111 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10112 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10113 !EnableLoopInterleaving),
10114 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10115 !EnableLoopVectorization) {}
10117 bool LoopVectorizePass::processLoop(Loop *L) {
10118 assert((EnableVPlanNativePath || L->isInnermost()) &&
10119 "VPlan-native path is not enabled. Only process inner loops.");
10121 #ifndef NDEBUG
10122 const std::string DebugLocStr = getDebugLocString(L);
10123 #endif /* NDEBUG */
10125 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10126 << L->getHeader()->getParent()->getName() << "\" from "
10127 << DebugLocStr << "\n");
10129 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
10131 LLVM_DEBUG(
10132 dbgs() << "LV: Loop hints:"
10133 << " force="
10134 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10135 ? "disabled"
10136 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10137 ? "enabled"
10138 : "?"))
10139 << " width=" << Hints.getWidth()
10140 << " interleave=" << Hints.getInterleave() << "\n");
10142 // Function containing loop
10143 Function *F = L->getHeader()->getParent();
10145 // Looking at the diagnostic output is the only way to determine if a loop
10146 // was vectorized (other than looking at the IR or machine code), so it
10147 // is important to generate an optimization remark for each loop. Most of
10148 // these messages are generated as OptimizationRemarkAnalysis. Remarks
10149 // generated as OptimizationRemark and OptimizationRemarkMissed are
10150 // less verbose, reporting vectorized loops and unvectorized loops that may
10151 // benefit from vectorization, respectively.
10153 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10154 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10155 return false;
10158 PredicatedScalarEvolution PSE(*SE, *L);
10160 // Check if it is legal to vectorize the loop.
10161 LoopVectorizationRequirements Requirements;
10162 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10163 &Requirements, &Hints, DB, AC, BFI, PSI);
10164 if (!LVL.canVectorize(EnableVPlanNativePath)) {
10165 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10166 Hints.emitRemarkWithHints();
10167 return false;
10170 // Check the function attributes and profiles to find out if this function
10171 // should be optimized for size.
10172 ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10173 F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10175 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10176 // here. They may require CFG and instruction level transformations before
10177 // even evaluating whether vectorization is profitable. Since we cannot modify
10178 // the incoming IR, we need to build VPlan upfront in the vectorization
10179 // pipeline.
10180 if (!L->isInnermost())
10181 return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10182 ORE, BFI, PSI, Hints, Requirements);
10184 assert(L->isInnermost() && "Inner loop expected.");
10186 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10187 // count by optimizing for size, to minimize overheads.
10188 auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10189 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10190 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10191 << "This loop is worth vectorizing only if no scalar "
10192 << "iteration overheads are incurred.");
10193 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10194 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10195 else {
10196 LLVM_DEBUG(dbgs() << "\n");
10197 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10201 // Check the function attributes to see if implicit floats are allowed.
10202 // FIXME: This check doesn't seem like it can be correct -- what if the loop is
10203 // an integer loop and the vector instructions selected are purely integer
10204 // vector instructions?
10205 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10206 reportVectorizationFailure(
10207 "Can't vectorize when the NoImplicitFloat attribute is used",
10208 "loop not vectorized due to NoImplicitFloat attribute",
10209 "NoImplicitFloat", ORE, L);
10210 Hints.emitRemarkWithHints();
10211 return false;
10214 // Check if the target supports potentially unsafe FP vectorization.
10215 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10216 // for the target we're vectorizing for, to make sure none of the
10217 // additional fp-math flags can help.
10218 if (Hints.isPotentiallyUnsafe() &&
10219 TTI->isFPVectorizationPotentiallyUnsafe()) {
10220 reportVectorizationFailure(
10221 "Potentially unsafe FP op prevents vectorization",
10222 "loop not vectorized due to unsafe FP support.",
10223 "UnsafeFP", ORE, L);
10224 Hints.emitRemarkWithHints();
10225 return false;
10228 if (!LVL.canVectorizeFPMath(ForceOrderedReductions)) {
10229 ORE->emit([&]() {
10230 auto *ExactFPMathInst = Requirements.getExactFPInst();
10231 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10232 ExactFPMathInst->getDebugLoc(),
10233 ExactFPMathInst->getParent())
10234 << "loop not vectorized: cannot prove it is safe to reorder "
10235 "floating-point operations";
10237 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10238 "reorder floating-point operations\n");
10239 Hints.emitRemarkWithHints();
10240 return false;
10243 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10244 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10246 // If an override option has been passed in for interleaved accesses, use it.
10247 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10248 UseInterleaved = EnableInterleavedMemAccesses;
10250 // Analyze interleaved memory accesses.
10251 if (UseInterleaved) {
10252 IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10255 // Use the cost model.
10256 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10257 F, &Hints, IAI);
10258 CM.collectValuesToIgnore();
10259 CM.collectElementTypesForWidening();
10261 // Use the planner for vectorization.
10262 LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10263 Requirements, ORE);
10265 // Get user vectorization factor and interleave count.
10266 ElementCount UserVF = Hints.getWidth();
10267 unsigned UserIC = Hints.getInterleave();
10269 // Plan how to best vectorize, return the best VF and its cost.
10270 Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10272 VectorizationFactor VF = VectorizationFactor::Disabled();
10273 unsigned IC = 1;
10275 if (MaybeVF) {
10276 VF = *MaybeVF;
10277 // Select the interleave count.
10278 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10281 // Identify the diagnostic messages that should be produced.
10282 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10283 bool VectorizeLoop = true, InterleaveLoop = true;
10284 if (VF.Width.isScalar()) {
10285 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10286 VecDiagMsg = std::make_pair(
10287 "VectorizationNotBeneficial",
10288 "the cost-model indicates that vectorization is not beneficial");
10289 VectorizeLoop = false;
10292 if (!MaybeVF && UserIC > 1) {
10293 // Tell the user interleaving was avoided up-front, despite being explicitly
10294 // requested.
10295 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10296 "interleaving should be avoided up front\n");
10297 IntDiagMsg = std::make_pair(
10298 "InterleavingAvoided",
10299 "Ignoring UserIC, because interleaving was avoided up front");
10300 InterleaveLoop = false;
10301 } else if (IC == 1 && UserIC <= 1) {
10302 // Tell the user interleaving is not beneficial.
10303 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10304 IntDiagMsg = std::make_pair(
10305 "InterleavingNotBeneficial",
10306 "the cost-model indicates that interleaving is not beneficial");
10307 InterleaveLoop = false;
10308 if (UserIC == 1) {
10309 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10310 IntDiagMsg.second +=
10311 " and is explicitly disabled or interleave count is set to 1";
10313 } else if (IC > 1 && UserIC == 1) {
10314 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10315 LLVM_DEBUG(
10316 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10317 IntDiagMsg = std::make_pair(
10318 "InterleavingBeneficialButDisabled",
10319 "the cost-model indicates that interleaving is beneficial "
10320 "but is explicitly disabled or interleave count is set to 1");
10321 InterleaveLoop = false;
10325 // Override IC if the user provided an interleave count.
10325 IC = UserIC > 0 ? UserIC : IC;
10327 // Emit diagnostic messages, if any.
10328 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10329 if (!VectorizeLoop && !InterleaveLoop) {
10330 // Do not vectorize or interleave the loop.
10331 ORE->emit([&]() {
10332 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10333 L->getStartLoc(), L->getHeader())
10334 << VecDiagMsg.second;
10336 ORE->emit([&]() {
10337 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10338 L->getStartLoc(), L->getHeader())
10339 << IntDiagMsg.second;
10341 return false;
10342 } else if (!VectorizeLoop && InterleaveLoop) {
10343 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10344 ORE->emit([&]() {
10345 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10346 L->getStartLoc(), L->getHeader())
10347 << VecDiagMsg.second;
10349 } else if (VectorizeLoop && !InterleaveLoop) {
10350 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10351 << ") in " << DebugLocStr << '\n');
10352 ORE->emit([&]() {
10353 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10354 L->getStartLoc(), L->getHeader())
10355 << IntDiagMsg.second;
10357 } else if (VectorizeLoop && InterleaveLoop) {
10358 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10359 << ") in " << DebugLocStr << '\n');
10360 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10363 bool DisableRuntimeUnroll = false;
10364 MDNode *OrigLoopID = L->getLoopID();
10366 // Optimistically generate runtime checks. Drop them if they turn out to not
10367 // be profitable. Limit the scope of Checks, so the cleanup happens
10368 // immediately after vector code generation is done.
10369 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10370 F->getParent()->getDataLayout());
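// Only create the runtime checks when the loop will actually be vectorized
// or interleaved.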
10371 if (!VF.Width.isScalar() || IC > 1)
10372 Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10373 LVP.setBestPlan(VF.Width, IC);
10375 using namespace ore;
10376 if (!VectorizeLoop) {
10377 assert(IC > 1 && "interleave count should not be 1 or 0");
10378 // If we decided that it is not legal to vectorize the loop, then
10379 // interleave it.
10380 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10381 &CM, BFI, PSI, Checks);
10382 LVP.executePlan(Unroller, DT);
10384 ORE->emit([&]() {
10385 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10386 L->getHeader())
10387 << "interleaved loop (interleaved count: "
10388 << NV("InterleaveCount", IC) << ")";
10390 } else {
10391 // If we decided that it is *legal* to vectorize the loop, then do it.
10393 // Consider vectorizing the epilogue too if it's profitable.
10394 VectorizationFactor EpilogueVF =
10395 CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10396 if (EpilogueVF.Width.isVector()) {
10398 // The first pass vectorizes the main loop and creates a scalar epilogue
10399 // to be vectorized by executing the plan (potentially with a different
10400 // factor) again shortly afterwards.
10401 EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
10402 EpilogueVF.Width.getKnownMinValue(),
10403 1);
10404 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10405 EPI, &LVL, &CM, BFI, PSI, Checks);
10407 LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
10408 LVP.executePlan(MainILV, DT);
10409 ++LoopsVectorized;
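// Vectorizing the main loop changed the surrounding loop structure;
// re-simplify the remainder loop and restore LCSSA form before vectorizing
// the epilogue below.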
10411 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10412 formLCSSARecursively(*L, *DT, LI, SE);
10414 // Second pass vectorizes the epilogue and adjusts the control flow
10415 // edges from the first pass.
10416 LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
10417 EPI.MainLoopVF = EPI.EpilogueVF;
10418 EPI.MainLoopUF = EPI.EpilogueUF;
10419 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10420 ORE, EPI, &LVL, &CM, BFI, PSI,
10421 Checks);
10422 LVP.executePlan(EpilogILV, DT);
10423 ++LoopsEpilogueVectorized;
10425 if (!MainILV.areSafetyChecksAdded())
10426 DisableRuntimeUnroll = true;
10427 } else {
10428 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10429 &LVL, &CM, BFI, PSI, Checks);
10430 LVP.executePlan(LB, DT);
10431 ++LoopsVectorized;
10433 // Add metadata to disable runtime unrolling of the scalar loop when there
10434 // are no runtime checks about strides and memory. A scalar loop that is
10435 // rarely used is not worth unrolling.
10436 if (!LB.areSafetyChecksAdded())
10437 DisableRuntimeUnroll = true;
10439 // Report the vectorization decision.
10440 ORE->emit([&]() {
10441 return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10442 L->getHeader())
10443 << "vectorized loop (vectorization width: "
10444 << NV("VectorizationFactor", VF.Width)
10445 << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10449 if (ORE->allowExtraAnalysis(LV_NAME))
10450 checkMixedPrecision(L, ORE);
10453 Optional<MDNode *> RemainderLoopID =
10454 makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10455 LLVMLoopVectorizeFollowupEpilogue});
10456 if (RemainderLoopID.hasValue()) {
10457 L->setLoopID(RemainderLoopID.getValue());
10458 } else {
10459 if (DisableRuntimeUnroll)
10460 AddRuntimeUnrollDisableMetaData(L);
10462 // Mark the loop as already vectorized to avoid vectorizing again.
10463 Hints.setAlreadyVectorized();
10466 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10467 return true;
10470 LoopVectorizeResult LoopVectorizePass::runImpl(
10471 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10472 DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10473 DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10474 std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10475 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10476 SE = &SE_;
10477 LI = &LI_;
10478 TTI = &TTI_;
10479 DT = &DT_;
10480 BFI = &BFI_;
10481 TLI = TLI_;
10482 AA = &AA_;
10483 AC = &AC_;
10484 GetLAA = &GetLAA_;
10485 DB = &DB_;
10486 ORE = &ORE_;
10487 PSI = PSI_;
10489 // Don't attempt if
10490 // 1. the target claims to have no vector registers, and
10491 // 2. interleaving won't help ILP.
10493 // The second condition is necessary because, even if the target has no
10494 // vector registers, loop vectorization may still enable scalar
10495 // interleaving.
10496 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10497 TTI->getMaxInterleaveFactor(1) < 2)
10498 return LoopVectorizeResult(false, false);
10500 bool Changed = false, CFGChanged = false;
10502 // The vectorizer requires loops to be in simplified form.
10503 // Since simplification may add new inner loops, it has to run before the
10504 // legality and profitability checks. This means running the loop vectorizer
10505 // will simplify all loops, regardless of whether anything ends up being
10506 // vectorized.
10507 for (auto &L : *LI)
10508 Changed |= CFGChanged |=
10509 simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10511 // Build up a worklist of inner-loops to vectorize. This is necessary as
10512 // the act of vectorizing or partially unrolling a loop creates new loops
10513 // and can invalidate iterators across the loops.
10514 SmallVector<Loop *, 8> Worklist;
10516 for (Loop *L : *LI)
10517 collectSupportedLoops(*L, LI, ORE, Worklist);
10519 LoopsAnalyzed += Worklist.size();
10521 // Now walk the identified inner loops.
10522 while (!Worklist.empty()) {
10523 Loop *L = Worklist.pop_back_val();
10525 // For the inner loops we actually process, form LCSSA to simplify the
10526 // transform.
10527 Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10529 Changed |= CFGChanged |= processLoop(L);
10532 // Process each loop nest in the function.
10533 return LoopVectorizeResult(Changed, CFGChanged);
10536 PreservedAnalyses LoopVectorizePass::run(Function &F,
10537 FunctionAnalysisManager &AM) {
10538 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10539 auto &LI = AM.getResult<LoopAnalysis>(F);
10540 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10541 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10542 auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10543 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10544 auto &AA = AM.getResult<AAManager>(F);
10545 auto &AC = AM.getResult<AssumptionAnalysis>(F);
10546 auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10547 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10549 auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
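// LoopAccessAnalysis is a loop-level analysis; expose it through a callback
// so the vectorizer can query LoopAccessInfo lazily for each loop it visits.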
10550 std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10551 [&](Loop &L) -> const LoopAccessInfo & {
10552 LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
10553 TLI, TTI, nullptr, nullptr};
10554 return LAM.getResult<LoopAccessAnalysis>(L, AR);
10556 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10557 ProfileSummaryInfo *PSI =
10558 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10559 LoopVectorizeResult Result =
10560 runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10561 if (!Result.MadeAnyChange)
10562 return PreservedAnalyses::all();
10563 PreservedAnalyses PA;
10565 // We currently do not preserve LoopInfo/DominatorTree analyses with outer loop
10566 // vectorization. Until this is addressed, mark these analyses as preserved
10567 // only for non-VPlan-native path.
10568 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10569 if (!EnableVPlanNativePath) {
10570 PA.preserve<LoopAnalysis>();
10571 PA.preserve<DominatorTreeAnalysis>();
10573 if (!Result.MadeCFGChange)
10574 PA.preserveSet<CFGAnalyses>();
10575 return PA;