//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();

namespace {
class VectorCombine {
public:
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC)
      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC) {}

  bool run();

private:
  Function &F;
  IRBuilder<> Builder;
  const TargetTransformInfo &TTI;
  const DominatorTree &DT;
  AAResults &AA;
  AssumptionCache &AC;

  bool vectorizeLoadInsert(Instruction &I);
  ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                        ExtractElementInst *Ext1,
                                        unsigned PreferredExtractIndex) const;
  bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                             unsigned Opcode,
                             ExtractElementInst *&ConvertToShuffle,
                             unsigned PreferredExtractIndex);
  void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                     Instruction &I);
  void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                       Instruction &I);
  bool foldExtractExtract(Instruction &I);
  bool foldBitcastShuf(Instruction &I);
  bool scalarizeBinopOrCmp(Instruction &I);
  bool foldExtractedCmps(Instruction &I);
  bool foldSingleElementStore(Instruction &I);
  bool scalarizeLoadExtract(Instruction &I);
};
} // namespace

static void replaceValue(Value &Old, Value &New) {
  Old.replaceAllUsesWith(&New);
  New.takeName(&Old);
}

bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
  // Match insert into fixed vector of scalar value.
  // TODO: Handle non-zero insert index.
  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
  Value *Scalar;
  if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
      !Scalar->hasOneUse())
    return false;

  // Optionally match an extract from another vector.
  Value *X;
  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
  if (!HasExtract)
    X = Scalar;

  // Match source value as load of scalar or vector.
  // Do not vectorize scalar load (widening) if atomic/volatile or under
  // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
  // or create data races non-existent in the source.
  auto *Load = dyn_cast<LoadInst>(X);
  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;

  const DataLayout &DL = I.getModule()->getDataLayout();
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");

  // If original AS != Load's AS, we can't bitcast the original pointer and have
  // to use Load's operand instead. Ideally we would want to strip pointer casts
  // without changing AS, but there's no API to do that ATM.
  unsigned AS = Load->getPointerAddressSpace();
  if (AS != SrcPtr->getType()->getPointerAddressSpace())
    SrcPtr = Load->getPointerOperand();

  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
  // sure we have all of our type-based constraints in place for this target.
  Type *ScalarTy = Scalar->getType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
      ScalarSize % 8 != 0)
    return false;

  // Check safety of replacing the scalar load with a larger vector load.
  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
  unsigned OffsetEltIndex = 0;
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
    // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it safe to load from a base address with
    // updated alignment. If it is, we can shuffle the element(s) into place
    // after the load.
    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);

    // We want to shuffle the result down from a high element of a vector, so
    // the offset must be positive.
    if (Offset.isNegative())
      return false;

    // The offset must be a multiple of the scalar element to shuffle cleanly
    // in the element's size.
    uint64_t ScalarSizeInBytes = ScalarSize / 8;
    if (Offset.urem(ScalarSizeInBytes) != 0)
      return false;

    // If we load MinVecNumElts, will our target element still be loaded?
    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
    if (OffsetEltIndex >= MinVecNumElts)
      return false;

    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
      return false;

    // Update alignment with offset value. Note that the offset could be negated
    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
    // negation does not change the result of the alignment calculation.
    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
  }

  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
  Type *LoadTy = Load->getType();
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
                                          /* Insert */ true, HasExtract);

  // New pattern: load VecPtr
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
  // Optionally, we are shuffling the loaded vector element(s) into place.
  // For the mask set everything but element 0 to undef to prevent poison from
  // propagating from the extra loaded memory. This will also optionally
  // shrink/grow the vector from the loaded size to the output size.
  // We assume this operation has no cost in codegen if there was no offset.
  // Note that we could use freeze to avoid poison problems, but then we might
  // still need a shuffle to change the vector size.
  unsigned OutputNumElts = Ty->getNumElements();
  SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
  Mask[0] = OffsetEltIndex;
  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // It is safe and potentially profitable to load a vector directly:
  // inselt undef, load Scalar, 0 --> load VecPtr
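  // Illustrative IR for the rewrite (an assumed example, not taken from the
  // original source or its tests):
  //   %s = load float, float* %p, align 4
  //   %r = insertelement <4 x float> undef, float %s, i32 0
  // -->
  //   %vp = bitcast float* %p to <4 x float>*
  //   %wide = load <4 x float>, <4 x float>* %vp, align 4
  //   %r = shufflevector <4 x float> %wide, <4 x float> undef,
  //                      <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>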
  IRBuilder<> Builder(Load);
  Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
  VecLd = Builder.CreateShuffleVector(VecLd, Mask);

  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const {
  assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
         isa<ConstantInt>(Ext1->getIndexOperand()) &&
         "Expected constant extract indexes");

  unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
  unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();

  // If the extract indexes are identical, no shuffle is needed.
  if (Index0 == Index1)
    return nullptr;

  Type *VecTy = Ext0->getVectorOperand()->getType();
  assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
  InstructionCost Cost0 =
      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
  InstructionCost Cost1 =
      TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);

  // If both costs are invalid, no shuffle is needed.
  if (!Cost0.isValid() && !Cost1.isValid())
    return nullptr;

  // We are extracting from 2 different indexes, so one operand must be shuffled
  // before performing a vector operation and/or extract. The more expensive
  // extract will be replaced by a shuffle.
  if (Cost0 > Cost1)
    return Ext0;
  if (Cost1 > Cost0)
    return Ext1;

  // If the costs are equal and there is a preferred extract index, shuffle the
  // opposite operand.
  if (PreferredExtractIndex == Index0)
    return Ext1;
  if (PreferredExtractIndex == Index1)
    return Ext0;

  // Otherwise, replace the extract with the higher index.
  return Index0 > Index1 ? Ext0 : Ext1;
}

/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          unsigned Opcode,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) {
  assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
         isa<ConstantInt>(Ext1->getOperand(1)) &&
         "Expected constant extract indexes");
  Type *ScalarTy = Ext0->getType();
  auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
  InstructionCost ScalarOpCost, VectorOpCost;

  // Get cost estimates for scalar and vector versions of the operation.
  bool IsBinOp = Instruction::isBinaryOp(Opcode);
  if (IsBinOp) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  } else {
    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
           "Expected a compare");
    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
                                          CmpInst::makeCmpResultType(ScalarTy));
    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
                                          CmpInst::makeCmpResultType(VecTy));
  }

  // Get cost estimates for the extract elements. These costs will factor into
  // both sequences.
  unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
  unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();

  InstructionCost Extract0Cost =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
  InstructionCost Extract1Cost =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);

  // A more expensive extract will always be replaced by a splat shuffle.
  // For example, if Ext0 is more expensive:
  // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
  // extelt (opcode (splat V0, Ext0), V1), Ext1
  // TODO: Evaluate whether that always results in lowest cost. Alternatively,
  // check the cost of creating a broadcast shuffle and shuffling both
  // operands to element 0.
  InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

  // Extra uses of the extracts mean that we include those costs in the
  // vector total because those instructions will not be eliminated.
  InstructionCost OldCost, NewCost;
  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
    // Handle a special case. If the 2 extracts are identical, adjust the
    // formulas to account for that. The extra use charge allows for either the
    // CSE'd pattern or an unoptimized form with identical values:
    // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
    bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
                                  : !Ext0->hasOneUse() || !Ext1->hasOneUse();
    OldCost = CheapExtractCost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
  } else {
    // Handle the general case. Each extract is actually a different value:
    // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
    OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost +
              !Ext0->hasOneUse() * Extract0Cost +
              !Ext1->hasOneUse() * Extract1Cost;
  }

  ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
  if (ConvertToShuffle) {
    if (IsBinOp && DisableBinopExtractShuffle)
      return true;

    // If we are extracting from 2 different indexes, then one operand must be
    // shuffled before performing the vector operation. The shuffle mask is
    // undefined except for 1 lane that is being translated to the remaining
    // extraction lane. Therefore, it is a splat shuffle. Ex:
    // ShufMask = { undef, undef, 0, undef }
    // TODO: The cost model has an option for a "broadcast" shuffle
    // (splat-from-element-0), but no option for a more general splat.
    NewCost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  }

  // Aggressively form a vector op if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  return OldCost < NewCost;
}

/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilder<> &Builder) {
  // The shuffle mask is undefined except for 1 lane that is being translated
  // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
  // ShufMask = { 2, undef, undef, undef }
  auto *VecTy = cast<FixedVectorType>(Vec->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
  ShufMask[NewIndex] = OldIndex;
  return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not creating
/// unnecessary instructions.
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
                                            unsigned NewIndex,
                                            IRBuilder<> &Builder) {
  // If the extract can be constant-folded, this code is unsimplified. Defer
  // to other passes to handle that.
  Value *X = ExtElt->getVectorOperand();
  Value *C = ExtElt->getIndexOperand();
  assert(isa<ConstantInt>(C) && "Expected a constant index operand");
  if (isa<Constant>(X))
    return nullptr;

  Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
                                   NewIndex, Builder);
  return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
}

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
                                  ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<CmpInst>(&I) && "Expected a compare");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
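  // For example (illustrative IR, an assumed example not taken from the
  // original source):
  //   %e0 = extractelement <4 x i32> %v0, i32 1
  //   %e1 = extractelement <4 x i32> %v1, i32 1
  //   %c  = icmp sgt i32 %e0, %e1
  // -->
  //   %vc = icmp sgt <4 x i32> %v0, %v1
  //   %c  = extractelement <4 x i1> %vc, i32 1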
  ++NumVecCmp;
  CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
  Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
                                    ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
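  // For example (illustrative IR, an assumed example not taken from the
  // original source):
  //   %e0 = extractelement <4 x float> %v0, i32 2
  //   %e1 = extractelement <4 x float> %v1, i32 2
  //   %r  = fadd float %e0, %e1
  // -->
  //   %vbo = fadd <4 x float> %v0, %v1
  //   %r   = extractelement <4 x float> %vbo, i32 2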
  ++NumVecBO;
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecBO =
      Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);

  // All IR flags are safe to back-propagate because any potential poison
  // created in unused vector elements is discarded by the extract.
  if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
    VecBOInst->copyIRFlags(&I);

  Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Match an instruction with extracted vector operands.
bool VectorCombine::foldExtractExtract(Instruction &I) {
  // It is not safe to transform things like div, urem, etc. because we may
  // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;

  Instruction *I0, *I1;
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;

  Value *V0, *V1;
  uint64_t C0, C1;
  if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
      !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
      V0->getType() != V1->getType())
    return false;

  // If the scalar value 'I' is going to be re-inserted into a vector, then try
  // to create an extract to that same element. The extract/insert can be
  // reduced to a "select shuffle".
  // TODO: If we add a larger pattern match that starts from an insert, this
  // probably becomes unnecessary.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  uint64_t InsertIndex = InvalidIndex;
  if (I.hasOneUse())
    match(I.user_back(),
          m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));

  ExtractElementInst *ExtractToChange;
  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
                            InsertIndex))
    return false;

  if (ExtractToChange) {
    unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
    ExtractElementInst *NewExtract =
        translateExtract(ExtractToChange, CheapExtractIdx, Builder);
    if (!NewExtract)
      return false;
    if (ExtractToChange == Ext0)
      Ext0 = NewExtract;
    else
      Ext1 = NewExtract;
  }

  if (Pred != CmpInst::BAD_ICMP_PREDICATE)
    foldExtExtCmp(Ext0, Ext1, I);
  else
    foldExtExtBinop(Ext0, Ext1, I);

  return true;
}

/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
bool VectorCombine::foldBitcastShuf(Instruction &I) {
  Value *V;
  ArrayRef<int> Mask;
  if (!match(&I, m_BitCast(
                     m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
    return false;

  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
  // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
  // mask for scalable type is a splat or not.
  // 2) Disallow non-vector casts and length-changing shuffles.
  // TODO: We could allow any shuffle.
  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
  auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
  if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
    return false;

  unsigned DestNumElts = DestTy->getNumElements();
  unsigned SrcNumElts = SrcTy->getNumElements();
  SmallVector<int, 16> NewMask;
  if (SrcNumElts <= DestNumElts) {
    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
    // always be expanded to the equivalent form choosing narrower elements.
    assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = DestNumElts / SrcNumElts;
    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
  } else {
    // The bitcast is from narrow elements to wide elements. The shuffle mask
    // must choose consecutive elements to allow casting first.
    assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = SrcNumElts / DestNumElts;
    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
      return false;
  }

  // The new shuffle must not cost more than the old shuffle. The bitcast is
  // moved ahead of the shuffle, so assume that it has the same cost as before.
  InstructionCost DestCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
  InstructionCost SrcCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
  if (DestCost > SrcCost || !DestCost.isValid())
    return false;

  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
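  // For example (illustrative IR, an assumed example not taken from the
  // original source; the mask is widened to match the new element count):
  //   %s = shufflevector <4 x i32> %v, <4 x i32> undef,
  //                      <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  //   %b = bitcast <4 x i32> %s to <2 x i64>
  // -->
  //   %b = bitcast <4 x i32> %v to <2 x i64>
  //   %s = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> <i32 1, i32 0>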
  ++NumShufOfBitcast;
  Value *CastV = Builder.CreateBitCast(V, DestTy);
  Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
  replaceValue(I, *Shuf);
  return true;
}

/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  Value *Ins0, *Ins1;
  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
    return false;

  // Do not convert the vector condition of a vector select into a scalar
  // condition. That may cause problems for codegen because of differences in
  // boolean formats and register-file transfers.
  // TODO: Can we account for that in the cost model?
  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
  if (IsCmp)
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;

  // Match against one or both scalar values being inserted into constant
  // vectors:
  // vec_op VecC0, (inselt VecC1, V1, Index)
  // vec_op (inselt VecC0, V0, Index), VecC1
  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
  // TODO: Deal with mismatched index constants and variable indexes?
  Constant *VecC0 = nullptr, *VecC1 = nullptr;
  Value *V0 = nullptr, *V1 = nullptr;
  uint64_t Index0 = 0, Index1 = 0;
  if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
                               m_ConstantInt(Index0))) &&
      !match(Ins0, m_Constant(VecC0)))
    return false;
  if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
                               m_ConstantInt(Index1))) &&
      !match(Ins1, m_Constant(VecC1)))
    return false;

  bool IsConst0 = !V0;
  bool IsConst1 = !V1;
  if (IsConst0 && IsConst1)
    return false;
  if (!IsConst0 && !IsConst1 && Index0 != Index1)
    return false;

  // Bail for single insertion if it is a load.
  // TODO: Handle this once getVectorInstrCost can cost for load/stores.
  auto *I0 = dyn_cast_or_null<Instruction>(V0);
  auto *I1 = dyn_cast_or_null<Instruction>(V1);
  if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
      (IsConst1 && I0 && I0->mayReadFromMemory()))
    return false;

  uint64_t Index = IsConst0 ? Index1 : Index0;
  Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
  Type *VecTy = I.getType();
  assert(VecTy->isVectorTy() &&
         (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
          ScalarTy->isPointerTy()) &&
         "Unexpected types for insert element into binop or cmp");

  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (IsCmp) {
    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  }

  // Get cost estimate for the insert element. This cost will factor into
  // both sequences.
  InstructionCost InsertCost =
      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
  InstructionCost OldCost =
      (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
  InstructionCost NewCost = ScalarOpCost + InsertCost +
                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
  // inselt NewVecC, (scalar_op V0, V1), Index
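  // For example (illustrative IR, an assumed example not taken from the
  // original source):
  //   %i0 = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i32 0
  //   %r  = add <4 x i32> %i0, <i32 10, i32 20, i32 30, i32 40>
  // -->
  //   %s = add i32 %x, 10
  //   %r = insertelement <4 x i32> <i32 11, i32 22, i32 33, i32 44>, i32 %s, i32 0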
  if (IsCmp)
    ++NumScalarCmp;
  else
    ++NumScalarBO;

  // For constant cases, extract the scalar element; this should constant fold.
  if (IsConst0)
    V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
  if (IsConst1)
    V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));

  Value *Scalar =
      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);

  Scalar->setName(I.getName() + ".scalar");

  // All IR flags are safe to back-propagate. There is no potential for extra
  // poison to be created by the scalar instruction.
  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
    ScalarInst->copyIRFlags(&I);

  // Fold the vector constants in the original vectors into a new base vector.
  Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
                            : ConstantExpr::get(Opcode, VecC0, VecC1);
  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
  replaceValue(I, *Insert);
  return true;
}

/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
bool VectorCombine::foldExtractedCmps(Instruction &I) {
  // We are looking for a scalar binop of booleans.
  // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
  if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
    return false;

  // The compare predicates should match, and each compare should have a
  // constant operand.
  // TODO: Relax the one-use constraints.
  Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
  Instruction *I0, *I1;
  Constant *C0, *C1;
  CmpInst::Predicate P0, P1;
  if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
      !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
      P0 != P1)
    return false;

  // The compare operands must be extracts of the same vector with constant
  // extract indexes.
  // TODO: Relax the one-use constraints.
  Value *X;
  uint64_t Index0, Index1;
  if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
      !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
    return false;

  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
  if (!ConvertToShuf)
    return false;

  // The original scalar pattern is:
  // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
  CmpInst::Predicate Pred = P0;
  unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
                                                    : Instruction::ICmp;
  auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
  if (!VecTy)
    return false;

  InstructionCost OldCost =
      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
  OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());

  // The proposed vector pattern is:
  // vcmp = cmp Pred X, VecC
  // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
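  // For example (illustrative IR, an assumed example not taken from the
  // original source; it assumes the second extract is the one converted to a
  // shuffle, with Index0 == 1 and Index1 == 2):
  //   %e0 = extractelement <4 x i32> %x, i32 1
  //   %e1 = extractelement <4 x i32> %x, i32 2
  //   %c0 = icmp sgt i32 %e0, 42
  //   %c1 = icmp sgt i32 %e1, 99
  //   %r  = and i1 %c0, %c1
  // -->
  //   %vcmp  = icmp sgt <4 x i32> %x, <i32 undef, i32 42, i32 99, i32 undef>
  //   %shift = shufflevector <4 x i1> %vcmp, <4 x i1> undef,
  //                          <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  //   %vand  = and <4 x i1> %vcmp, %shift
  //   %r     = extractelement <4 x i1> %vand, i32 1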
  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
  InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
  ShufMask[CheapIndex] = ExpensiveIndex;
  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                ShufMask);
  NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
  NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);

  // Aggressively form vector ops if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Create a vector constant from the 2 scalar constants.
  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
                                   UndefValue::get(VecTy->getElementType()));
  CmpC[Index0] = C0;
  CmpC[Index1] = C1;
  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));

  ++NumVecCmpBO;
  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
                                        VCmp, Shuf);
  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
  replaceValue(I, *NewExt);
  return true;
}

// Check if the memory location is modified between two instructions in the
// same basic block.
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                 BasicBlock::iterator End,
                                 const MemoryLocation &Loc, AAResults &AA) {
  unsigned NumScanned = 0;
  return std::any_of(Begin, End, [&](const Instruction &Instr) {
    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
           ++NumScanned > MaxInstrsToScan;
  });
}

/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
static bool canScalarizeAccess(FixedVectorType *VecTy, Value *Idx,
                               Instruction *CtxI, AssumptionCache &AC) {
  if (auto *C = dyn_cast<ConstantInt>(Idx))
    return C->getValue().ult(VecTy->getNumElements());

  if (!isGuaranteedNotToBePoison(Idx, &AC))
    return false;

  APInt Zero(Idx->getType()->getScalarSizeInBits(), 0);
  APInt MaxElts(Idx->getType()->getScalarSizeInBits(), VecTy->getNumElements());
  ConstantRange ValidIndices(Zero, MaxElts);
  ConstantRange IdxRange = computeConstantRange(Idx, true, &AC, CtxI, 0);
  return ValidIndices.contains(IdxRange);
}

/// The memory operation on a vector of \p ScalarType had alignment of
/// \p VectorAlignment. Compute the maximal, but conservatively correct,
/// alignment that will be valid for the memory operation on a single scalar
/// element of the same type with index \p Idx.
static Align computeAlignmentAfterScalarization(Align VectorAlignment,
                                                Type *ScalarType, Value *Idx,
                                                const DataLayout &DL) {
  if (auto *C = dyn_cast<ConstantInt>(Idx))
    return commonAlignment(VectorAlignment,
                           C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
  return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
}

// Combine patterns like:
//   %0 = load <4 x i32>, <4 x i32>* %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, <4 x i32>* %a
// to:
//   %0 = bitcast <4 x i32>* %a to i32*
//   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
//   store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
  StoreInst *SI = dyn_cast<StoreInst>(&I);
  if (!SI || !SI->isSimple() ||
      !isa<FixedVectorType>(SI->getValueOperand()->getType()))
    return false;

  // TODO: Combine more complicated patterns (multiple insert) by referencing
  // TargetTransformInfo.
  Instruction *Source;
  Value *NewElement;
  Value *Idx;
  if (!match(SI->getValueOperand(),
             m_InsertElt(m_Instruction(Source), m_Value(NewElement),
                         m_Value(Idx))))
    return false;

  if (auto *Load = dyn_cast<LoadInst>(Source)) {
    auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
    const DataLayout &DL = I.getModule()->getDataLayout();
    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
    // Don't optimize for atomic/volatile load or store. Ensure memory is not
    // modified between, vector type matches store size, and index is inbounds.
    if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
        !DL.typeSizeEqualsStoreSize(Load->getType()) ||
        !canScalarizeAccess(VecTy, Idx, Load, AC) ||
        SrcAddr != SI->getPointerOperand()->stripPointerCasts() ||
        isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
                             MemoryLocation::get(SI), AA))
      return false;

    Value *GEP = Builder.CreateInBoundsGEP(
        SI->getValueOperand()->getType(), SI->getPointerOperand(),
        {ConstantInt::get(Idx->getType(), 0), Idx});
    StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
    NSI->copyMetadata(*SI);
    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
        DL);
    NSI->setAlignment(ScalarOpAlignment);
    replaceValue(I, *NSI);
    // The original store needs to be erased manually.
    I.eraseFromParent();
    return true;
  }

  return false;
}

/// Try to scalarize vector loads feeding extractelement instructions.
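/// An illustrative sketch of the intended rewrite (assumed IR, not taken from
/// the original source or its tests):
///   %v = load <4 x i32>, <4 x i32>* %p
///   %e = extractelement <4 x i32> %v, i32 2
/// -->
///   %gep = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 0, i32 2
///   %e.scalar = load i32, i32* %gep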
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
  Value *Ptr;
  Value *Idx;
  if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx))))
    return false;

  auto *LI = cast<LoadInst>(I.getOperand(0));
  const DataLayout &DL = I.getModule()->getDataLayout();
  if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
    return false;

  auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
  if (!FixedVT)
    return false;

  InstructionCost OriginalCost = TTI.getMemoryOpCost(
      Instruction::Load, LI->getType(), Align(LI->getAlignment()),
      LI->getPointerAddressSpace());
  InstructionCost ScalarizedCost = 0;

  Instruction *LastCheckedInst = LI;
  unsigned NumInstChecked = 0;
  // Check if all users of the load are extracts with no memory modifications
  // between the load and the extract. Compute the cost of both the original
  // code and the scalarized version.
  for (User *U : LI->users()) {
    auto *UI = dyn_cast<ExtractElementInst>(U);
    if (!UI || UI->getParent() != LI->getParent())
      return false;

    if (!isGuaranteedNotToBePoison(UI->getOperand(1), &AC, LI, &DT))
      return false;

    // Check if any instruction between the load and the extract may modify
    // memory.
    if (LastCheckedInst->comesBefore(UI)) {
      for (Instruction &I :
           make_range(std::next(LI->getIterator()), UI->getIterator())) {
        // Bail out if we reached the check limit or the instruction may write
        // to memory.
        if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
          return false;
        NumInstChecked++;
      }
    }

    if (!LastCheckedInst)
      LastCheckedInst = UI;
    else if (LastCheckedInst->comesBefore(UI))
      LastCheckedInst = UI;

    if (!canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC))
      return false;

    auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
    OriginalCost +=
        TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
                               Index ? Index->getZExtValue() : -1);
    ScalarizedCost +=
        TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
                            Align(1), LI->getPointerAddressSpace());
    ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType());
  }

  if (ScalarizedCost >= OriginalCost)
    return false;

  // Replace extracts with narrow scalar loads.
  for (User *U : LI->users()) {
    auto *EI = cast<ExtractElementInst>(U);
    Builder.SetInsertPoint(EI);

    Value *Idx = EI->getOperand(1);
    Value *GEP =
        Builder.CreateInBoundsGEP(FixedVT, Ptr, {Builder.getInt32(0), Idx});
    auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
        FixedVT->getElementType(), GEP, EI->getName() + ".scalar"));

    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        LI->getAlign(), FixedVT->getElementType(), Idx, DL);
    NewLoad->setAlignment(ScalarOpAlignment);

    replaceValue(*EI, *NewLoad);
  }

  return true;
}

/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
  if (DisableVectorCombine)
    return false;

  // Don't attempt vectorization if the target does not support vectors.
  if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
    return false;

  bool MadeChange = false;
  for (BasicBlock &BB : F) {
    // Ignore unreachable basic blocks.
    if (!DT.isReachableFromEntry(&BB))
      continue;
    // Use early increment range so that we can erase instructions in loop.
    for (Instruction &I : make_early_inc_range(BB)) {
      if (isa<DbgInfoIntrinsic>(I))
        continue;
      Builder.SetInsertPoint(&I);
      MadeChange |= vectorizeLoadInsert(I);
      MadeChange |= foldExtractExtract(I);
      MadeChange |= foldBitcastShuf(I);
      MadeChange |= scalarizeBinopOrCmp(I);
      MadeChange |= foldExtractedCmps(I);
      MadeChange |= scalarizeLoadExtract(I);
      MadeChange |= foldSingleElementStore(I);
    }
  }

  // We're done with transforms, so remove dead instructions.
  if (MadeChange)
    for (BasicBlock &BB : F)
      SimplifyInstructionsInBlock(&BB);

  return MadeChange;
}

// Pass manager boilerplate below here.

namespace {
class VectorCombineLegacyPass : public FunctionPass {
public:
  static char ID;
  VectorCombineLegacyPass() : FunctionPass(ID) {
    initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.setPreservesCFG();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
    VectorCombine Combiner(F, TTI, DT, AA, AC);
    return Combiner.run();
  }
};
} // namespace

char VectorCombineLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
                      "Optimize scalar/vector ops", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
                    "Optimize scalar/vector ops", false, false)
Pass *llvm::createVectorCombinePass() {
  return new VectorCombineLegacyPass();
}

PreservedAnalyses VectorCombinePass::run(Function &F,
                                         FunctionAnalysisManager &FAM) {
  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  AAResults &AA = FAM.getResult<AAManager>(F);
  VectorCombine Combiner(F, TTI, DT, AA, AC);
  if (!Combiner.run())
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}