llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

   1 //===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This pass performs various transformations related to eliminating memcpy
  10 // calls, or transforming sets of stores into memset's.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
  15 #include "llvm/ADT/DenseSet.h"
  16 #include "llvm/ADT/STLExtras.h"
  17 #include "llvm/ADT/ScopeExit.h"
  18 #include "llvm/ADT/SmallVector.h"
  19 #include "llvm/ADT/Statistic.h"
  20 #include "llvm/ADT/iterator_range.h"
  21 #include "llvm/Analysis/AliasAnalysis.h"
  22 #include "llvm/Analysis/AssumptionCache.h"
  23 #include "llvm/Analysis/CFG.h"
  24 #include "llvm/Analysis/CaptureTracking.h"
  25 #include "llvm/Analysis/GlobalsModRef.h"
  26 #include "llvm/Analysis/InstructionSimplify.h"
  27 #include "llvm/Analysis/Loads.h"
  28 #include "llvm/Analysis/MemoryLocation.h"
  29 #include "llvm/Analysis/MemorySSA.h"
  30 #include "llvm/Analysis/MemorySSAUpdater.h"
  31 #include "llvm/Analysis/PostDominators.h"
  32 #include "llvm/Analysis/TargetLibraryInfo.h"
  33 #include "llvm/Analysis/ValueTracking.h"
  34 #include "llvm/IR/BasicBlock.h"
  35 #include "llvm/IR/Constants.h"
  36 #include "llvm/IR/DataLayout.h"
  37 #include "llvm/IR/DerivedTypes.h"
  38 #include "llvm/IR/Dominators.h"
  39 #include "llvm/IR/Function.h"
  40 #include "llvm/IR/GlobalVariable.h"
  41 #include "llvm/IR/IRBuilder.h"
  42 #include "llvm/IR/InstrTypes.h"
  43 #include "llvm/IR/Instruction.h"
  44 #include "llvm/IR/Instructions.h"
  45 #include "llvm/IR/IntrinsicInst.h"
  46 #include "llvm/IR/Intrinsics.h"
  47 #include "llvm/IR/LLVMContext.h"
  48 #include "llvm/IR/Module.h"
  49 #include "llvm/IR/PassManager.h"
  50 #include "llvm/IR/Type.h"
  51 #include "llvm/IR/User.h"
  52 #include "llvm/IR/Value.h"
  53 #include "llvm/Support/Casting.h"
  54 #include "llvm/Support/Debug.h"
  55 #include "llvm/Support/raw_ostream.h"
  56 #include "llvm/Transforms/Utils/Local.h"
  57 #include <algorithm>
  58 #include <cassert>
  59 #include <cstdint>
  60 #include <optional>
  61
  62 using namespace llvm;
  63
  64 #define DEBUG_TYPE "memcpyopt"
  65
  66 static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
  67     "enable-memcpyopt-without-libcalls", cl::Hidden,
  68     cl::desc("Enable memcpyopt even when libcalls are disabled"));
  69
  70 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
  71 STATISTIC(NumMemMoveInstr, "Number of memmove instructions deleted");
  72 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
  73 STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
  74 STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
  75 STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
  76 STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
  77
  78 namespace {
  79
  80 /// Represents a range of memset'd bytes with the ByteVal value.
  81 /// This allows us to analyze stores like:
  82 ///   store 0 -> P+1
  83 ///   store 0 -> P+0
  84 ///   store 0 -> P+3
  85 ///   store 0 -> P+2
  86 /// which sometimes happens with stores to arrays of structs etc.  When we see
  87 /// the first store, we make a range [1, 2).  The second store extends the range
  88 /// to [0, 2).  The third makes a new range [2, 3).  The fourth store joins the
  89 /// two ranges into [0, 3) which is memset'able.
  90 struct MemsetRange {
  91   // Start/End - A semi range that describes the span that this range covers.
  92   // The range is closed at the start and open at the end: [Start, End).
  93   int64_t Start, End;
  94
  95   /// StartPtr - The getelementptr instruction that points to the start of the
  96   /// range.
  97   Value *StartPtr;
  98
  99   /// Alignment - The known alignment of the first store.
 100   MaybeAlign Alignment;
 101
 102   /// TheStores - The actual stores that make up this range.
 103   SmallVector<Instruction *, 16> TheStores;
 104
 105   bool isProfitableToUseMemset(const DataLayout &DL) const;
 106 };
 107
 108 } // end anonymous namespace
 109
 110 bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
 111   // If we found more than 4 stores to merge or 16 bytes, use memset.
 112   if (TheStores.size() >= 4 || End - Start >= 16)
 113     return true;
 114
 115   // If there is nothing to merge, don't do anything.
 116   if (TheStores.size() < 2)
 117     return false;
 118
 119   // If any of the stores are a memset, then it is always good to extend the
 120   // memset.
 121   for (Instruction *SI : TheStores)
 122     if (!isa<StoreInst>(SI))
 123       return true;
 124
 125   // Assume that the code generator is capable of merging pairs of stores
 126   // together if it wants to.
 127   if (TheStores.size() == 2)
 128     return false;
 129
 130   // If we have fewer than 8 stores, it can still be worthwhile to do this.
 131   // For example, merging 4 i8 stores into an i32 store is useful almost always.
 132   // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
 133   // memset will be split into 2 32-bit stores anyway) and doing so can
 134   // pessimize the llvm optimizer.
 135   //
 136   // Since we don't have perfect knowledge here, make some assumptions: assume
 137   // the maximum GPR width is the same size as the largest legal integer
 138   // size. If so, check to see whether we will end up actually reducing the
 139   // number of stores used.
 140   unsigned Bytes = unsigned(End - Start);
 141   unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
 142   if (MaxIntSize == 0)
 143     MaxIntSize = 1;
 144   unsigned NumPointerStores = Bytes / MaxIntSize;
 145
 146   // Assume the remaining bytes if any are done a byte at a time.
 147   unsigned NumByteStores = Bytes % MaxIntSize;
 148
 149   // If we will reduce the # stores (according to this heuristic), do the
 150   // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
 151   // etc.
 152   return TheStores.size() > NumPointerStores + NumByteStores;
 153 }
 154
 155 namespace {
 156
 157 class MemsetRanges {
 158   using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
 159
 160   /// A sorted list of the memset ranges.
 161   SmallVector<MemsetRange, 8> Ranges;
 162
 163   const DataLayout &DL;
 164
 165 public:
 166   MemsetRanges(const DataLayout &DL) : DL(DL) {}
 167
 168   using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
 169
 170   const_iterator begin() const { return Ranges.begin(); }
 171   const_iterator end() const { return Ranges.end(); }
 172   bool empty() const { return Ranges.empty(); }
 173
 174   void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
 175     if (auto *SI = dyn_cast<StoreInst>(Inst))
 176       addStore(OffsetFromFirst, SI);
 177     else
 178       addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
 179   }
 180
 181   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
 182     TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
 183     assert(!StoreSize.isScalable() && "Can't track scalable-typed stores");
 184     addRange(OffsetFromFirst, StoreSize.getFixedValue(),
 185              SI->getPointerOperand(), SI->getAlign(), SI);
 186   }
 187
 188   void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
 189     int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
 190     addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlign(), MSI);
 191   }
 192
 193   void addRange(int64_t Start, int64_t Size, Value *Ptr, MaybeAlign Alignment,
 194                 Instruction *Inst);
 195 };
 196
 197 } // end anonymous namespace
 198
 199 /// Add a new store to the MemsetRanges data structure.  This adds a
 200 /// new range for the specified store at the specified offset, merging into
 201 /// existing ranges as appropriate.
 202 void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
 203                             MaybeAlign Alignment, Instruction *Inst) {
 204   int64_t End = Start + Size;
 205
 206   range_iterator I = partition_point(
 207       Ranges, [=](const MemsetRange &O) { return O.End < Start; });
 208
 209   // We now know that I == E, in which case we didn't find anything to merge
 210   // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
 211   // to insert a new range.  Handle this now.
 212   if (I == Ranges.end() || End < I->Start) {
 213     MemsetRange &R = *Ranges.insert(I, MemsetRange());
 214     R.Start = Start;
 215     R.End = End;
 216     R.StartPtr = Ptr;
 217     R.Alignment = Alignment;
 218     R.TheStores.push_back(Inst);
 219     return;
 220   }
 221
 222   // This store overlaps with I, add it.
 223   I->TheStores.push_back(Inst);
 224
 225   // At this point, we may have an interval that completely contains our store.
 226   // If so, just add it to the interval and return.
 227   if (I->Start <= Start && I->End >= End)
 228     return;
 229
 230   // Now we know that Start <= I->End and End >= I->Start so the range overlaps
 231   // but is not entirely contained within the range.
 232
 233   // See if the range extends the start of the range.  In this case, it couldn't
 234   // possibly cause it to join the prior range, because otherwise we would have
 235   // stopped on *it*.
 236   if (Start < I->Start) {
 237     I->Start = Start;
 238     I->StartPtr = Ptr;
 239     I->Alignment = Alignment;
 240   }
 241
 242   // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
 243   // is in or right at the end of I), and that End >= I->Start.  Extend I out to
 244   // End.
 245   if (End > I->End) {
 246     I->End = End;
 247     range_iterator NextI = I;
 248     while (++NextI != Ranges.end() && End >= NextI->Start) {
 249       // Merge the range in.
 250       I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
 251       if (NextI->End > I->End)
 252         I->End = NextI->End;
 253       Ranges.erase(NextI);
 254       NextI = I;
 255     }
 256   }
 257 }
 258
 259 //===----------------------------------------------------------------------===//
 260 //                         MemCpyOptLegacyPass Pass
 261 //===----------------------------------------------------------------------===//
 262
 263 // Check that V is either not accessible by the caller, or unwinding cannot
 264 // occur between Start and End.
 265 static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
 266                                          Instruction *End) {
 267   assert(Start->getParent() == End->getParent() && "Must be in same block");
 268   // Function can't unwind, so it also can't be visible through unwinding.
 269   if (Start->getFunction()->doesNotThrow())
 270     return false;
 271
 272   // Object is not visible on unwind.
 273   // TODO: Support RequiresNoCaptureBeforeUnwind case.
 274   bool RequiresNoCaptureBeforeUnwind;
 275   if (isNotVisibleOnUnwind(getUnderlyingObject(V),
 276                            RequiresNoCaptureBeforeUnwind) &&
 277       !RequiresNoCaptureBeforeUnwind)
 278     return false;
 279
 280   // Check whether there are any unwinding instructions in the range.
 281   return any_of(make_range(Start->getIterator(), End->getIterator()),
 282                 [](const Instruction &I) { return I.mayThrow(); });
 283 }
 284
 285 void MemCpyOptPass::eraseInstruction(Instruction *I) {
 286   MSSAU->removeMemoryAccess(I);
 287   EEA->removeInstruction(I);
 288   I->eraseFromParent();
 289 }
 290
 291 // Check for mod or ref of Loc between Start and End, excluding both boundaries.
 292 // Start and End must be in the same block.
 293 // If SkippedLifetimeStart is provided, skip over one clobbering lifetime.start
 294 // intrinsic and store it inside SkippedLifetimeStart.
 295 static bool accessedBetween(BatchAAResults &AA, MemoryLocation Loc,
 296                             const MemoryUseOrDef *Start,
 297                             const MemoryUseOrDef *End,
 298                             Instruction **SkippedLifetimeStart = nullptr) {
 299   assert(Start->getBlock() == End->getBlock() && "Only local supported");
 300   for (const MemoryAccess &MA :
 301        make_range(++Start->getIterator(), End->getIterator())) {
 302     Instruction *I = cast<MemoryUseOrDef>(MA).getMemoryInst();
 303     if (isModOrRefSet(AA.getModRefInfo(I, Loc))) {
 304       auto *II = dyn_cast<IntrinsicInst>(I);
 305       if (II && II->getIntrinsicID() == Intrinsic::lifetime_start &&
 306           SkippedLifetimeStart && !*SkippedLifetimeStart) {
 307         *SkippedLifetimeStart = I;
 308         continue;
 309       }
 310
 311       return true;
 312     }
 313   }
 314   return false;
 315 }
 316
 317 // Check for mod of Loc between Start and End, excluding both boundaries.
 318 // Start and End can be in different blocks.
 319 static bool writtenBetween(MemorySSA *MSSA, BatchAAResults &AA,
 320                            MemoryLocation Loc, const MemoryUseOrDef *Start,
 321                            const MemoryUseOrDef *End) {
 322   if (isa<MemoryUse>(End)) {
 323     // For MemoryUses, getClobberingMemoryAccess may skip non-clobbering writes.
 324     // Manually check read accesses between Start and End, if they are in the
 325     // same block, for clobbers. Otherwise assume Loc is clobbered.
 326     return Start->getBlock() != End->getBlock() ||
 327            any_of(
 328                make_range(std::next(Start->getIterator()), End->getIterator()),
 329                [&AA, Loc](const MemoryAccess &Acc) {
 330                  if (isa<MemoryUse>(&Acc))
 331                    return false;
 332                  Instruction *AccInst =
 333                      cast<MemoryUseOrDef>(&Acc)->getMemoryInst();
 334                  return isModSet(AA.getModRefInfo(AccInst, Loc));
 335                });
 336   }
 337
 338   // TODO: Only walk until we hit Start.
 339   MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
 340       End->getDefiningAccess(), Loc, AA);
 341   return !MSSA->dominates(Clobber, Start);
 342 }
 343
 344 // Update AA metadata
 345 static void combineAAMetadata(Instruction *ReplInst, Instruction *I) {
 346   // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
 347   // handled here, but combineMetadata doesn't support them yet
 348   unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
 349                          LLVMContext::MD_noalias,
 350                          LLVMContext::MD_invariant_group,
 351                          LLVMContext::MD_access_group};
 352   combineMetadata(ReplInst, I, KnownIDs, true);
 353 }
 354
 355 /// When scanning forward over instructions, we look for some other patterns to
 356 /// fold away. In particular, this looks for stores to neighboring locations of
 357 /// memory. If it sees enough consecutive ones, it attempts to merge them
 358 /// together into a memcpy/memset.
 359 Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
 360                                                  Value *StartPtr,
 361                                                  Value *ByteVal) {
 362   const DataLayout &DL = StartInst->getDataLayout();
 363
 364   // We can't track scalable types
 365   if (auto *SI = dyn_cast<StoreInst>(StartInst))
 366     if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
 367       return nullptr;
 368
 369   // Okay, so we now have a single store that can be splatable.  Scan to find
 370   // all subsequent stores of the same value to offset from the same pointer.
 371   // Join these together into ranges, so we can decide whether contiguous blocks
 372   // are stored.
 373   MemsetRanges Ranges(DL);
 374
 375   BasicBlock::iterator BI(StartInst);
 376
 377   // Keeps track of the last memory use or def before the insertion point for
 378   // the new memset. The new MemoryDef for the inserted memsets will be inserted
 379   // after MemInsertPoint.
 380   MemoryUseOrDef *MemInsertPoint = nullptr;
 381   for (++BI; !BI->isTerminator(); ++BI) {
 382     auto *CurrentAcc =
 383         cast_or_null<MemoryUseOrDef>(MSSA->getMemoryAccess(&*BI));
 384     if (CurrentAcc)
 385       MemInsertPoint = CurrentAcc;
 386
 387     // Calls that only access inaccessible memory do not block merging
 388     // accessible stores.
 389     if (auto *CB = dyn_cast<CallBase>(BI)) {
 390       if (CB->onlyAccessesInaccessibleMemory())
 391         continue;
 392     }
 393
 394     if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
 395       // If the instruction is readnone, ignore it, otherwise bail out.  We
 396       // don't even allow readonly here because we don't want something like:
 397       // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
 398       if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
 399         break;
 400       continue;
 401     }
 402
 403     if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
 404       // If this is a store, see if we can merge it in.
 405       if (!NextStore->isSimple())
 406         break;
 407
 408       Value *StoredVal = NextStore->getValueOperand();
 409
 410       // Don't convert stores of non-integral pointer types to memsets (which
 411       // stores integers).
 412       if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
 413         break;
 414
 415       // We can't track ranges involving scalable types.
 416       if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
 417         break;
 418
 419       // Check to see if this stored value is of the same byte-splattable value.
 420       Value *StoredByte = isBytewiseValue(StoredVal, DL);
 421       if (isa<UndefValue>(ByteVal) && StoredByte)
 422         ByteVal = StoredByte;
 423       if (ByteVal != StoredByte)
 424         break;
 425
 426       // Check to see if this store is to a constant offset from the start ptr.
 427       std::optional<int64_t> Offset =
 428           NextStore->getPointerOperand()->getPointerOffsetFrom(StartPtr, DL);
 429       if (!Offset)
 430         break;
 431
 432       Ranges.addStore(*Offset, NextStore);
 433     } else {
 434       auto *MSI = cast<MemSetInst>(BI);
 435
 436       if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
 437           !isa<ConstantInt>(MSI->getLength()))
 438         break;
 439
 440       // Check to see if this store is to a constant offset from the start ptr.
 441       std::optional<int64_t> Offset =
 442           MSI->getDest()->getPointerOffsetFrom(StartPtr, DL);
 443       if (!Offset)
 444         break;
 445
 446       Ranges.addMemSet(*Offset, MSI);
 447     }
 448   }
 449
 450   // If we have no ranges, then we just had a single store with nothing that
 451   // could be merged in.  This is a very common case of course.
 452   if (Ranges.empty())
 453     return nullptr;
 454
 455   // If we had at least one store that could be merged in, add the starting
 456   // store as well.  We try to avoid this unless there is at least something
 457   // interesting as a small compile-time optimization.
 458   Ranges.addInst(0, StartInst);
 459
 460   // If we create any memsets, we put it right before the first instruction that
 461   // isn't part of the memset block.  This ensure that the memset is dominated
 462   // by any addressing instruction needed by the start of the block.
 463   IRBuilder<> Builder(&*BI);
 464
 465   // Now that we have full information about ranges, loop over the ranges and
 466   // emit memset's for anything big enough to be worthwhile.
 467   Instruction *AMemSet = nullptr;
 468   for (const MemsetRange &Range : Ranges) {
 469     if (Range.TheStores.size() == 1)
 470       continue;
 471
 472     // If it is profitable to lower this range to memset, do so now.
 473     if (!Range.isProfitableToUseMemset(DL))
 474       continue;
 475
 476     // Otherwise, we do want to transform this!  Create a new memset.
 477     // Get the starting pointer of the block.
 478     StartPtr = Range.StartPtr;
 479
 480     AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
 481                                    Range.Alignment);
 482     AMemSet->mergeDIAssignID(Range.TheStores);
 483
 484     LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
 485                                                    : Range.TheStores) dbgs()
 486                                               << *SI << '\n';
 487                dbgs() << "With: " << *AMemSet << '\n');
 488     if (!Range.TheStores.empty())
 489       AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
 490
 491     auto *NewDef = cast<MemoryDef>(
 492         MemInsertPoint->getMemoryInst() == &*BI
 493             ? MSSAU->createMemoryAccessBefore(AMemSet, nullptr, MemInsertPoint)
 494             : MSSAU->createMemoryAccessAfter(AMemSet, nullptr, MemInsertPoint));
 495     MSSAU->insertDef(NewDef, /*RenameUses=*/true);
 496     MemInsertPoint = NewDef;
 497
 498     // Zap all the stores.
 499     for (Instruction *SI : Range.TheStores)
 500       eraseInstruction(SI);
 501
 502     ++NumMemSetInfer;
 503   }
 504
 505   return AMemSet;
 506 }
 507
 508 // This method try to lift a store instruction before position P.
 509 // It will lift the store and its argument + that anything that
 510 // may alias with these.
 511 // The method returns true if it was successful.
 512 bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
 513   // If the store alias this position, early bail out.
 514   MemoryLocation StoreLoc = MemoryLocation::get(SI);
 515   if (isModOrRefSet(AA->getModRefInfo(P, StoreLoc)))
 516     return false;
 517
 518   // Keep track of the arguments of all instruction we plan to lift
 519   // so we can make sure to lift them as well if appropriate.
 520   DenseSet<Instruction *> Args;
 521   auto AddArg = [&](Value *Arg) {
 522     auto *I = dyn_cast<Instruction>(Arg);
 523     if (I && I->getParent() == SI->getParent()) {
 524       // Cannot hoist user of P above P
 525       if (I == P)
 526         return false;
 527       Args.insert(I);
 528     }
 529     return true;
 530   };
 531   if (!AddArg(SI->getPointerOperand()))
 532     return false;
 533
 534   // Instruction to lift before P.
 535   SmallVector<Instruction *, 8> ToLift{SI};
 536
 537   // Memory locations of lifted instructions.
 538   SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
 539
 540   // Lifted calls.
 541   SmallVector<const CallBase *, 8> Calls;
 542
 543   const MemoryLocation LoadLoc = MemoryLocation::get(LI);
 544
 545   for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
 546     auto *C = &*I;
 547
 548     // Make sure hoisting does not perform a store that was not guaranteed to
 549     // happen.
 550     if (!isGuaranteedToTransferExecutionToSuccessor(C))
 551       return false;
 552
 553     bool MayAlias = isModOrRefSet(AA->getModRefInfo(C, std::nullopt));
 554
 555     bool NeedLift = false;
 556     if (Args.erase(C))
 557       NeedLift = true;
 558     else if (MayAlias) {
 559       NeedLift = llvm::any_of(MemLocs, [C, this](const MemoryLocation &ML) {
 560         return isModOrRefSet(AA->getModRefInfo(C, ML));
 561       });
 562
 563       if (!NeedLift)
 564         NeedLift = llvm::any_of(Calls, [C, this](const CallBase *Call) {
 565           return isModOrRefSet(AA->getModRefInfo(C, Call));
 566         });
 567     }
 568
 569     if (!NeedLift)
 570       continue;
 571
 572     if (MayAlias) {
 573       // Since LI is implicitly moved downwards past the lifted instructions,
 574       // none of them may modify its source.
 575       if (isModSet(AA->getModRefInfo(C, LoadLoc)))
 576         return false;
 577       else if (const auto *Call = dyn_cast<CallBase>(C)) {
 578         // If we can't lift this before P, it's game over.
 579         if (isModOrRefSet(AA->getModRefInfo(P, Call)))
 580           return false;
 581
 582         Calls.push_back(Call);
 583       } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
 584         // If we can't lift this before P, it's game over.
 585         auto ML = MemoryLocation::get(C);
 586         if (isModOrRefSet(AA->getModRefInfo(P, ML)))
 587           return false;
 588
 589         MemLocs.push_back(ML);
 590       } else
 591         // We don't know how to lift this instruction.
 592         return false;
 593     }
 594
 595     ToLift.push_back(C);
 596     for (Value *Op : C->operands())
 597       if (!AddArg(Op))
 598         return false;
 599   }
 600
 601   // Find MSSA insertion point. Normally P will always have a corresponding
 602   // memory access before which we can insert. However, with non-standard AA
 603   // pipelines, there may be a mismatch between AA and MSSA, in which case we
 604   // will scan for a memory access before P. In either case, we know for sure
 605   // that at least the load will have a memory access.
 606   // TODO: Simplify this once P will be determined by MSSA, in which case the
 607   // discrepancy can no longer occur.
 608   MemoryUseOrDef *MemInsertPoint = nullptr;
 609   if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(P)) {
 610     MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator());
 611   } else {
 612     const Instruction *ConstP = P;
 613     for (const Instruction &I : make_range(++ConstP->getReverseIterator(),
 614                                            ++LI->getReverseIterator())) {
 615       if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(&I)) {
 616         MemInsertPoint = MA;
 617         break;
 618       }
 619     }
 620   }
 621
 622   // We made it, we need to lift.
 623   for (auto *I : llvm::reverse(ToLift)) {
 624     LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
 625     I->moveBefore(P);
 626     assert(MemInsertPoint && "Must have found insert point");
 627     if (MemoryUseOrDef *MA = MSSA->getMemoryAccess(I)) {
 628       MSSAU->moveAfter(MA, MemInsertPoint);
 629       MemInsertPoint = MA;
 630     }
 631   }
 632
 633   return true;
 634 }
 635
 636 bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
 637                                        const DataLayout &DL,
 638                                        BasicBlock::iterator &BBI) {
 639   if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent())
 640     return false;
 641
 642   BatchAAResults BAA(*AA, EEA);
 643   auto *T = LI->getType();
 644   // Don't introduce calls to memcpy/memmove intrinsics out of thin air if
 645   // the corresponding libcalls are not available.
 646   // TODO: We should really distinguish between libcall availability and
 647   // our ability to introduce intrinsics.
 648   if (T->isAggregateType() &&
 649       (EnableMemCpyOptWithoutLibcalls ||
 650        (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) {
 651     MemoryLocation LoadLoc = MemoryLocation::get(LI);
 652     MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI),
 653                    *StoreAccess = MSSA->getMemoryAccess(SI);
 654
 655     // We use MSSA to check if an instruction may store to the memory we load
 656     // from in between the load and the store. If such an instruction is found,
 657     // we try to promote there instead of at the store position.
 658     auto *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
 659         StoreAccess->getDefiningAccess(), LoadLoc, BAA);
 660     Instruction *P = MSSA->dominates(LoadAccess, Clobber)
 661                          ? cast<MemoryUseOrDef>(Clobber)->getMemoryInst()
 662                          : SI;
 663
 664     // If we found an instruction that may write to the loaded memory,
 665     // we can try to promote at this position instead of the store
 666     // position if nothing aliases the store memory after this and the store
 667     // destination is not in the range.
 668     if (P == SI || moveUp(SI, P, LI)) {
 669       // If we load from memory that may alias the memory we store to,
 670       // memmove must be used to preserve semantic. If not, memcpy can
 671       // be used. Also, if we load from constant memory, memcpy can be used
 672       // as the constant memory won't be modified.
 673       bool UseMemMove = false;
 674       if (isModSet(AA->getModRefInfo(SI, LoadLoc)))
 675         UseMemMove = true;
 676
 677       IRBuilder<> Builder(P);
 678       Value *Size =
 679           Builder.CreateTypeSize(Builder.getInt64Ty(), DL.getTypeStoreSize(T));
 680       Instruction *M;
 681       if (UseMemMove)
 682         M = Builder.CreateMemMove(SI->getPointerOperand(), SI->getAlign(),
 683                                   LI->getPointerOperand(), LI->getAlign(),
 684                                   Size);
 685       else
 686         M = Builder.CreateMemCpy(SI->getPointerOperand(), SI->getAlign(),
 687                                  LI->getPointerOperand(), LI->getAlign(), Size);
 688       M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);
 689
 690       LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => " << *M
 691                         << "\n");
 692
 693       auto *LastDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
 694       auto *NewAccess = MSSAU->createMemoryAccessAfter(M, nullptr, LastDef);
 695       MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
 696
 697       eraseInstruction(SI);
 698       eraseInstruction(LI);
 699       ++NumMemCpyInstr;
 700
 701       // Make sure we do not invalidate the iterator.
 702       BBI = M->getIterator();
 703       return true;
 704     }
 705   }
 706
 707   // Detect cases where we're performing call slot forwarding, but
 708   // happen to be using a load-store pair to implement it, rather than
 709   // a memcpy.
 710   auto GetCall = [&]() -> CallInst * {
 711     // We defer this expensive clobber walk until the cheap checks
 712     // have been done on the source inside performCallSlotOptzn.
 713     if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
 714             MSSA->getWalker()->getClobberingMemoryAccess(LI, BAA)))
 715       return dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
 716     return nullptr;
 717   };
 718
 719   bool Changed = performCallSlotOptzn(
 720       LI, SI, SI->getPointerOperand()->stripPointerCasts(),
 721       LI->getPointerOperand()->stripPointerCasts(),
 722       DL.getTypeStoreSize(SI->getOperand(0)->getType()),
 723       std::min(SI->getAlign(), LI->getAlign()), BAA, GetCall);
 724   if (Changed) {
 725     eraseInstruction(SI);
 726     eraseInstruction(LI);
 727     ++NumMemCpyInstr;
 728     return true;
 729   }
 730
 731   // If this is a load-store pair from a stack slot to a stack slot, we
 732   // might be able to perform the stack-move optimization just as we do for
 733   // memcpys from an alloca to an alloca.
 734   if (auto *DestAlloca = dyn_cast<AllocaInst>(SI->getPointerOperand())) {
 735     if (auto *SrcAlloca = dyn_cast<AllocaInst>(LI->getPointerOperand())) {
 736       if (performStackMoveOptzn(LI, SI, DestAlloca, SrcAlloca,
 737                                 DL.getTypeStoreSize(T), BAA)) {
 738         // Avoid invalidating the iterator.
 739         BBI = SI->getNextNonDebugInstruction()->getIterator();
 740         eraseInstruction(SI);
 741         eraseInstruction(LI);
 742         ++NumMemCpyInstr;
 743         return true;
 744       }
 745     }
 746   }
 747
 748   return false;
 749 }
 750
 751 bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
 752   if (!SI->isSimple())
 753     return false;
 754
 755   // Avoid merging nontemporal stores since the resulting
 756   // memcpy/memset would not be able to preserve the nontemporal hint.
 757   // In theory we could teach how to propagate the !nontemporal metadata to
 758   // memset calls. However, that change would force the backend to
 759   // conservatively expand !nontemporal memset calls back to sequences of
 760   // store instructions (effectively undoing the merging).
 761   if (SI->getMetadata(LLVMContext::MD_nontemporal))
 762     return false;
 763
 764   const DataLayout &DL = SI->getDataLayout();
 765
 766   Value *StoredVal = SI->getValueOperand();
 767
 768   // Not all the transforms below are correct for non-integral pointers, bail
 769   // until we've audited the individual pieces.
 770   if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
 771     return false;
 772
 773   // Load to store forwarding can be interpreted as memcpy.
 774   if (auto *LI = dyn_cast<LoadInst>(StoredVal))
 775     return processStoreOfLoad(SI, LI, DL, BBI);
 776
 777   // The following code creates memset intrinsics out of thin air. Don't do
 778   // this if the corresponding libfunc is not available.
 779   // TODO: We should really distinguish between libcall availability and
 780   // our ability to introduce intrinsics.
 781   if (!(TLI->has(LibFunc_memset) || EnableMemCpyOptWithoutLibcalls))
 782     return false;
 783
 784   // There are two cases that are interesting for this code to handle: memcpy
 785   // and memset.  Right now we only handle memset.
 786
 787   // Ensure that the value being stored is something that can be memset'able a
 788   // byte at a time like "0" or "-1" or any width, as well as things like
 789   // 0xA0A0A0A0 and 0.0.
 790   Value *V = SI->getOperand(0);
 791   Value *ByteVal = isBytewiseValue(V, DL);
 792   if (!ByteVal)
 793     return false;
 794
 795   if (Instruction *I =
 796           tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) {
 797     BBI = I->getIterator(); // Don't invalidate iterator.
 798     return true;
 799   }
 800
 801   // If we have an aggregate, we try to promote it to memset regardless
 802   // of opportunity for merging as it can expose optimization opportunities
 803   // in subsequent passes.
 804   auto *T = V->getType();
 805   if (!T->isAggregateType())
 806     return false;
 807
 808   TypeSize Size = DL.getTypeStoreSize(T);
 809   if (Size.isScalable())
 810     return false;
 811
 812   IRBuilder<> Builder(SI);
 813   auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
 814                                  SI->getAlign());
 815   M->copyMetadata(*SI, LLVMContext::MD_DIAssignID);
 816
 817   LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
 818
 819   // The newly inserted memset is immediately overwritten by the original
 820   // store, so we do not need to rename uses.
 821   auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
 822   auto *NewAccess = MSSAU->createMemoryAccessBefore(M, nullptr, StoreDef);
 823   MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
 824
 825   eraseInstruction(SI);
 826   NumMemSetInfer++;
 827
 828   // Make sure we do not invalidate the iterator.
 829   BBI = M->getIterator();
 830   return true;
 831 }
 832
 833 bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 834   // See if there is another memset or store neighboring this memset which
 835   // allows us to widen out the memset to do a single larger store.
 836   if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
 837     if (Instruction *I =
 838             tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) {
 839       BBI = I->getIterator(); // Don't invalidate iterator.
 840       return true;
 841     }
 842   return false;
 843 }
 844
 845 /// Takes a memcpy and a call that it depends on,
 846 /// and checks for the possibility of a call slot optimization by having
 847 /// the call write its result directly into the destination of the memcpy.
 848 bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
 849                                          Instruction *cpyStore, Value *cpyDest,
 850                                          Value *cpySrc, TypeSize cpySize,
 851                                          Align cpyDestAlign,
 852                                          BatchAAResults &BAA,
 853                                          std::function<CallInst *()> GetC) {
 854   // The general transformation to keep in mind is
 855   //
 856   //   call @func(..., src, ...)
 857   //   memcpy(dest, src, ...)
 858   //
 859   // ->
 860   //
 861   //   memcpy(dest, src, ...)
 862   //   call @func(..., dest, ...)
 863   //
 864   // Since moving the memcpy is technically awkward, we additionally check that
 865   // src only holds uninitialized values at the moment of the call, meaning that
 866   // the memcpy can be discarded rather than moved.
 867
 868   // We can't optimize scalable types.
 869   if (cpySize.isScalable())
 870     return false;
 871
 872   // Require that src be an alloca.  This simplifies the reasoning considerably.
 873   auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
 874   if (!srcAlloca)
 875     return false;
 876
 877   ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
 878   if (!srcArraySize)
 879     return false;
 880
 881   const DataLayout &DL = cpyLoad->getDataLayout();
 882   TypeSize SrcAllocaSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType());
 883   // We can't optimize scalable types.
 884   if (SrcAllocaSize.isScalable())
 885     return false;
 886   uint64_t srcSize = SrcAllocaSize * srcArraySize->getZExtValue();
 887
 888   if (cpySize < srcSize)
 889     return false;
 890
 891   CallInst *C = GetC();
 892   if (!C)
 893     return false;
 894
 895   // Lifetime marks shouldn't be operated on.
 896   if (Function *F = C->getCalledFunction())
 897     if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
 898       return false;
 899
 900   if (C->getParent() != cpyStore->getParent()) {
 901     LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
 902     return false;
 903   }
 904
 905   MemoryLocation DestLoc =
 906       isa<StoreInst>(cpyStore)
 907           ? MemoryLocation::get(cpyStore)
 908           : MemoryLocation::getForDest(cast<MemCpyInst>(cpyStore));
 909
 910   // Check that nothing touches the dest of the copy between
 911   // the call and the store/memcpy.
 912   Instruction *SkippedLifetimeStart = nullptr;
 913   if (accessedBetween(BAA, DestLoc, MSSA->getMemoryAccess(C),
 914                       MSSA->getMemoryAccess(cpyStore), &SkippedLifetimeStart)) {
 915     LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer modified after call\n");
 916     return false;
 917   }
 918
 919   // If we need to move a lifetime.start above the call, make sure that we can
 920   // actually do so. If the argument is bitcasted for example, we would have to
 921   // move the bitcast as well, which we don't handle.
 922   if (SkippedLifetimeStart) {
 923     auto *LifetimeArg =
 924         dyn_cast<Instruction>(SkippedLifetimeStart->getOperand(1));
 925     if (LifetimeArg && LifetimeArg->getParent() == C->getParent() &&
 926         C->comesBefore(LifetimeArg))
 927       return false;
 928   }
 929
 930   // Check that storing to the first srcSize bytes of dest will not cause a
 931   // trap or data race.
 932   bool ExplicitlyDereferenceableOnly;
 933   if (!isWritableObject(getUnderlyingObject(cpyDest),
 934                         ExplicitlyDereferenceableOnly) ||
 935       !isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize),
 936                                           DL, C, AC, DT)) {
 937     LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n");
 938     return false;
 939   }
 940
 941   // Make sure that nothing can observe cpyDest being written early. There are
 942   // a number of cases to consider:
 943   //  1. cpyDest cannot be accessed between C and cpyStore as a precondition of
 944   //     the transform.
 945   //  2. C itself may not access cpyDest (prior to the transform). This is
 946   //     checked further below.
 947   //  3. If cpyDest is accessible to the caller of this function (potentially
 948   //     captured and not based on an alloca), we need to ensure that we cannot
 949   //     unwind between C and cpyStore. This is checked here.
 950   //  4. If cpyDest is potentially captured, there may be accesses to it from
 951   //     another thread. In this case, we need to check that cpyStore is
 952   //     guaranteed to be executed if C is. As it is a non-atomic access, it
 953   //     renders accesses from other threads undefined.
 954   //     TODO: This is currently not checked.
 955   if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) {
 956     LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding\n");
 957     return false;
 958   }
 959
 960   // Check that dest points to memory that is at least as aligned as src.
 961   Align srcAlign = srcAlloca->getAlign();
 962   bool isDestSufficientlyAligned = srcAlign <= cpyDestAlign;
 963   // If dest is not aligned enough and we can't increase its alignment then
 964   // bail out.
 965   if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest)) {
 966     LLVM_DEBUG(dbgs() << "Call Slot: Dest not sufficiently aligned\n");
 967     return false;
 968   }
 969
 970   // Check that src is not accessed except via the call and the memcpy.  This
 971   // guarantees that it holds only undefined values when passed in (so the final
 972   // memcpy can be dropped), that it is not read or written between the call and
 973   // the memcpy, and that writing beyond the end of it is undefined.
 974   SmallVector<User *, 8> srcUseList(srcAlloca->users());
 975   while (!srcUseList.empty()) {
 976     User *U = srcUseList.pop_back_val();
 977
 978     if (isa<AddrSpaceCastInst>(U)) {
 979       append_range(srcUseList, U->users());
 980       continue;
 981     }
 982     if (const auto *IT = dyn_cast<IntrinsicInst>(U))
 983       if (IT->isLifetimeStartOrEnd())
 984         continue;
 985
 986     if (U != C && U != cpyLoad) {
 987       LLVM_DEBUG(dbgs() << "Call slot: Source accessed by " << *U << "\n");
 988       return false;
 989     }
 990   }
 991
 992   // Check whether src is captured by the called function, in which case there
 993   // may be further indirect uses of src.
 994   bool SrcIsCaptured = any_of(C->args(), [&](Use &U) {
 995     return U->stripPointerCasts() == cpySrc &&
 996            !C->doesNotCapture(C->getArgOperandNo(&U));
 997   });
 998
 999   // If src is captured, then check whether there are any potential uses of
1000   // src through the captured pointer before the lifetime of src ends, either
1001   // due to a lifetime.end or a return from the function.
1002   if (SrcIsCaptured) {
1003     // Check that dest is not captured before/at the call. We have already
1004     // checked that src is not captured before it. If either had been captured,
1005     // then the call might be comparing the argument against the captured dest
1006     // or src pointer.
1007     Value *DestObj = getUnderlyingObject(cpyDest);
1008     if (!isIdentifiedFunctionLocal(DestObj) ||
1009         PointerMayBeCapturedBefore(DestObj, /* ReturnCaptures */ true,
1010                                    /* StoreCaptures */ true, C, DT,
1011                                    /* IncludeI */ true))
1012       return false;
1013
1014     MemoryLocation SrcLoc =
1015         MemoryLocation(srcAlloca, LocationSize::precise(srcSize));
1016     for (Instruction &I :
1017          make_range(++C->getIterator(), C->getParent()->end())) {
1018       // Lifetime of srcAlloca ends at lifetime.end.
1019       if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
1020         if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
1021             II->getArgOperand(1)->stripPointerCasts() == srcAlloca &&
1022             cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize))
1023           break;
1024       }
1025
1026       // Lifetime of srcAlloca ends at return.
1027       if (isa<ReturnInst>(&I))
1028         break;
1029
1030       // Ignore the direct read of src in the load.
1031       if (&I == cpyLoad)
1032         continue;
1033
1034       // Check whether this instruction may mod/ref src through the captured
1035       // pointer (we have already any direct mod/refs in the loop above).
1036       // Also bail if we hit a terminator, as we don't want to scan into other
1037       // blocks.
1038       if (isModOrRefSet(BAA.getModRefInfo(&I, SrcLoc)) || I.isTerminator())
1039         return false;
1040     }
1041   }
1042
1043   // Since we're changing the parameter to the callsite, we need to make sure
1044   // that what would be the new parameter dominates the callsite.
1045   bool NeedMoveGEP = false;
1046   if (!DT->dominates(cpyDest, C)) {
1047     // Support moving a constant index GEP before the call.
1048     auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
1049     if (GEP && GEP->hasAllConstantIndices() &&
1050         DT->dominates(GEP->getPointerOperand(), C))
1051       NeedMoveGEP = true;
1052     else
1053       return false;
1054   }
1055
1056   // In addition to knowing that the call does not access src in some
1057   // unexpected manner, for example via a global, which we deduce from
1058   // the use analysis, we also need to know that it does not sneakily
1059   // access dest.  We rely on AA to figure this out for us.
1060   MemoryLocation DestWithSrcSize(cpyDest, LocationSize::precise(srcSize));
1061   ModRefInfo MR = BAA.getModRefInfo(C, DestWithSrcSize);
1062   // If necessary, perform additional analysis.
1063   if (isModOrRefSet(MR))
1064     MR = BAA.callCapturesBefore(C, DestWithSrcSize, DT);
1065   if (isModOrRefSet(MR))
1066     return false;
1067
1068   // We can't create address space casts here because we don't know if they're
1069   // safe for the target.
1070   if (cpySrc->getType() != cpyDest->getType())
1071     return false;
1072   for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
1073     if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
1074         cpySrc->getType() != C->getArgOperand(ArgI)->getType())
1075       return false;
1076
1077   // All the checks have passed, so do the transformation.
1078   bool changedArgument = false;
1079   for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
1080     if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
1081       changedArgument = true;
1082       C->setArgOperand(ArgI, cpyDest);
1083     }
1084
1085   if (!changedArgument)
1086     return false;
1087
1088   // If the destination wasn't sufficiently aligned then increase its alignment.
1089   if (!isDestSufficientlyAligned) {
1090     assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
1091     cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
1092   }
1093
1094   if (NeedMoveGEP) {
1095     auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
1096     GEP->moveBefore(C);
1097   }
1098
1099   if (SkippedLifetimeStart) {
1100     SkippedLifetimeStart->moveBefore(C);
1101     MSSAU->moveBefore(MSSA->getMemoryAccess(SkippedLifetimeStart),
1102                       MSSA->getMemoryAccess(C));
1103   }
1104
1105   combineAAMetadata(C, cpyLoad);
1106   if (cpyLoad != cpyStore)
1107     combineAAMetadata(C, cpyStore);
1108
1109   ++NumCallSlot;
1110   return true;
1111 }
1112
1113 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
1114 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
1115 bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
1116                                                   MemCpyInst *MDep,
1117                                                   BatchAAResults &BAA) {
1118   // If dep instruction is reading from our current input, then it is a noop
1119   // transfer and substituting the input won't change this instruction. Just
1120   // ignore the input and let someone else zap MDep. This handles cases like:
1121   //    memcpy(a <- a)
1122   //    memcpy(b <- a)
1123   if (M->getSource() == MDep->getSource())
1124     return false;
1125
1126   // We can only optimize non-volatile memcpy's.
1127   if (MDep->isVolatile())
1128     return false;
1129
1130   int64_t MForwardOffset = 0;
1131   const DataLayout &DL = M->getModule()->getDataLayout();
1132   // We can only transforms memcpy's where the dest of one is the source of the
1133   // other, or they have an offset in a range.
1134   if (M->getSource() != MDep->getDest()) {
1135     std::optional<int64_t> Offset =
1136         M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL);
1137     if (!Offset || *Offset < 0)
1138       return false;
1139     MForwardOffset = *Offset;
1140   }
1141
1142   // The length of the memcpy's must be the same, or the preceding one
1143   // must be larger than the following one.
1144   if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) {
1145     auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
1146     auto *MLen = dyn_cast<ConstantInt>(M->getLength());
1147     if (!MDepLen || !MLen ||
1148         MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
1149       return false;
1150   }
1151
1152   IRBuilder<> Builder(M);
1153   auto *CopySource = MDep->getSource();
1154   Instruction *NewCopySource = nullptr;
1155   auto CleanupOnRet = llvm::make_scope_exit([&] {
1156     if (NewCopySource && NewCopySource->use_empty())
1157       // Safety: It's safe here because we will only allocate more instructions
1158       // after finishing all BatchAA queries, but we have to be careful if we
1159       // want to do something like this in another place. Then we'd probably
1160       // have to delay instruction removal until all transforms on an
1161       // instruction finished.
1162       eraseInstruction(NewCopySource);
1163   });
1164   MaybeAlign CopySourceAlign = MDep->getSourceAlign();
1165   // We just need to calculate the actual size of the copy.
1166   auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
1167       MemoryLocation::getForSource(M).Size);
1168
1169   // When the forwarding offset is greater than 0, we transform
1170   //    memcpy(d1 <- s1)
1171   //    memcpy(d2 <- d1+o)
1172   // to
1173   //    memcpy(d2 <- s1+o)
1174   if (MForwardOffset > 0) {
1175     // The copy destination of `M` maybe can serve as the source of copying.
1176     std::optional<int64_t> MDestOffset =
1177         M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
1178     if (MDestOffset == MForwardOffset)
1179       CopySource = M->getDest();
1180     else {
1181       CopySource = Builder.CreateInBoundsPtrAdd(
1182           CopySource, Builder.getInt64(MForwardOffset));
1183       NewCopySource = dyn_cast<Instruction>(CopySource);
1184     }
1185     // We need to update `MCopyLoc` if an offset exists.
1186     MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
1187     if (CopySourceAlign)
1188       CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);
1189   }
1190
1191   // Avoid infinite loops
1192   if (BAA.isMustAlias(M->getSource(), CopySource))
1193     return false;
1194
1195   // Verify that the copied-from memory doesn't change in between the two
1196   // transfers.  For example, in:
1197   //    memcpy(a <- b)
1198   //    *b = 42;
1199   //    memcpy(c <- a)
1200   // It would be invalid to transform the second memcpy into memcpy(c <- b).
1201   //
1202   // TODO: If the code between M and MDep is transparent to the destination "c",
1203   // then we could still perform the xform by moving M up to the first memcpy.
1204   if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
1205                      MSSA->getMemoryAccess(M)))
1206     return false;
1207
1208   // No need to create `memcpy(a <- a)`.
1209   if (BAA.isMustAlias(M->getDest(), CopySource)) {
1210     // Remove the instruction we're replacing.
1211     eraseInstruction(M);
1212     ++NumMemCpyInstr;
1213     return true;
1214   }
1215
1216   // If the dest of the second might alias the source of the first, then the
1217   // source and dest might overlap. In addition, if the source of the first
1218   // points to constant memory, they won't overlap by definition. Otherwise, we
1219   // still want to eliminate the intermediate value, but we have to generate a
1220   // memmove instead of memcpy.
1221   bool UseMemMove = false;
1222   if (isModSet(BAA.getModRefInfo(M, MemoryLocation::getForSource(MDep)))) {
1223     // Don't convert llvm.memcpy.inline into memmove because memmove can be
1224     // lowered as a call, and that is not allowed for llvm.memcpy.inline (and
1225     // there is no inline version of llvm.memmove)
1226     if (isa<MemCpyInlineInst>(M))
1227       return false;
1228     UseMemMove = true;
1229   }
1230
1231   // If all checks passed, then we can transform M.
1232   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
1233                     << *MDep << '\n'
1234                     << *M << '\n');
1235
1236   // TODO: Is this worth it if we're creating a less aligned memcpy? For
1237   // example we could be moving from movaps -> movq on x86.
1238   Instruction *NewM;
1239   if (UseMemMove)
1240     NewM =
1241         Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
1242                               CopySourceAlign, M->getLength(), M->isVolatile());
1243   else if (isa<MemCpyInlineInst>(M)) {
1244     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
1245     // never allowed since that would allow the latter to be lowered as a call
1246     // to an external function.
1247     NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(),
1248                                       CopySource, CopySourceAlign,
1249                                       M->getLength(), M->isVolatile());
1250   } else
1251     NewM =
1252         Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource,
1253                              CopySourceAlign, M->getLength(), M->isVolatile());
1254   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
1255
1256   assert(isa<MemoryDef>(MSSA->getMemoryAccess(M)));
1257   auto *LastDef = cast<MemoryDef>(MSSA->getMemoryAccess(M));
1258   auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, nullptr, LastDef);
1259   MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
1260
1261   // Remove the instruction we're replacing.
1262   eraseInstruction(M);
1263   ++NumMemCpyInstr;
1264   return true;
1265 }
1266
1267 /// We've found that the (upward scanning) memory dependence of \p MemCpy is
1268 /// \p MemSet.  Try to simplify \p MemSet to only set the trailing bytes that
1269 /// weren't copied over by \p MemCpy.
1270 ///
1271 /// In other words, transform:
1272 /// \code
1273 ///   memset(dst, c, dst_size);
1274 ///   ...
1275 ///   memcpy(dst, src, src_size);
1276 /// \endcode
1277 /// into:
1278 /// \code
1279 ///   ...
1280 ///   memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
1281 ///   memcpy(dst, src, src_size);
1282 /// \endcode
1283 ///
1284 /// The memset is sunk to just before the memcpy to ensure that src_size is
1285 /// present when emitting the simplified memset.
1286 bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
1287                                                   MemSetInst *MemSet,
1288                                                   BatchAAResults &BAA) {
1289   // We can only transform memset/memcpy with the same destination.
1290   if (!BAA.isMustAlias(MemSet->getDest(), MemCpy->getDest()))
1291     return false;
1292
1293   // Don't perform the transform if src_size may be zero. In that case, the
1294   // transform is essentially a complex no-op and may lead to an infinite
1295   // loop if BasicAA is smart enough to understand that dst and dst + src_size
1296   // are still MustAlias after the transform.
1297   Value *SrcSize = MemCpy->getLength();
1298   if (!isKnownNonZero(SrcSize,
1299                       SimplifyQuery(MemCpy->getDataLayout(), DT, AC, MemCpy)))
1300     return false;
1301
1302   // Check that src and dst of the memcpy aren't the same. While memcpy
1303   // operands cannot partially overlap, exact equality is allowed.
1304   if (isModSet(BAA.getModRefInfo(MemCpy, MemoryLocation::getForSource(MemCpy))))
1305     return false;
1306
1307   // We know that dst up to src_size is not written. We now need to make sure
1308   // that dst up to dst_size is not accessed. (If we did not move the memset,
1309   // checking for reads would be sufficient.)
1310   if (accessedBetween(BAA, MemoryLocation::getForDest(MemSet),
1311                       MSSA->getMemoryAccess(MemSet),
1312                       MSSA->getMemoryAccess(MemCpy)))
1313     return false;
1314
1315   // Use the same i8* dest as the memcpy, killing the memset dest if different.
1316   Value *Dest = MemCpy->getRawDest();
1317   Value *DestSize = MemSet->getLength();
1318
1319   if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy))
1320     return false;
1321
1322   // If the sizes are the same, simply drop the memset instead of generating
1323   // a replacement with zero size.
1324   if (DestSize == SrcSize) {
1325     eraseInstruction(MemSet);
1326     return true;
1327   }
1328
1329   // By default, create an unaligned memset.
1330   Align Alignment = Align(1);
1331   // If Dest is aligned, and SrcSize is constant, use the minimum alignment
1332   // of the sum.
1333   const Align DestAlign = std::max(MemSet->getDestAlign().valueOrOne(),
1334                                    MemCpy->getDestAlign().valueOrOne());
1335   if (DestAlign > 1)
1336     if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
1337       Alignment = commonAlignment(DestAlign, SrcSizeC->getZExtValue());
1338
1339   IRBuilder<> Builder(MemCpy);
1340
1341   // Preserve the debug location of the old memset for the code emitted here
1342   // related to the new memset. This is correct according to the rules in
1343   // https://llvm.org/docs/HowToUpdateDebugInfo.html about "when to preserve an
1344   // instruction location", given that we move the memset within the basic
1345   // block.
1346   assert(MemSet->getParent() == MemCpy->getParent() &&
1347          "Preserving debug location based on moving memset within BB.");
1348   Builder.SetCurrentDebugLocation(MemSet->getDebugLoc());
1349
1350   // If the sizes have different types, zext the smaller one.
1351   if (DestSize->getType() != SrcSize->getType()) {
1352     if (DestSize->getType()->getIntegerBitWidth() >
1353         SrcSize->getType()->getIntegerBitWidth())
1354       SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
1355     else
1356       DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
1357   }
1358
1359   Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
1360   Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
1361   Value *MemsetLen = Builder.CreateSelect(
1362       Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
1363   Instruction *NewMemSet =
1364       Builder.CreateMemSet(Builder.CreatePtrAdd(Dest, SrcSize),
1365                            MemSet->getOperand(1), MemsetLen, Alignment);
1366
1367   assert(isa<MemoryDef>(MSSA->getMemoryAccess(MemCpy)) &&
1368          "MemCpy must be a MemoryDef");
1369   // The new memset is inserted before the memcpy, and it is known that the
1370   // memcpy's defining access is the memset about to be removed.
1371   auto *LastDef = cast<MemoryDef>(MSSA->getMemoryAccess(MemCpy));
1372   auto *NewAccess =
1373       MSSAU->createMemoryAccessBefore(NewMemSet, nullptr, LastDef);
1374   MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
1375
1376   eraseInstruction(MemSet);
1377   return true;
1378 }
1379
1380 /// Determine whether the instruction has undefined content for the given Size,
1381 /// either because it was freshly alloca'd or started its lifetime.
1382 static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
1383                              MemoryDef *Def, Value *Size) {
1384   if (MSSA->isLiveOnEntryDef(Def))
1385     return isa<AllocaInst>(getUnderlyingObject(V));
1386
1387   if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
1388     if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
1389       auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
1390
1391       if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
1392         if (AA.isMustAlias(V, II->getArgOperand(1)) &&
1393             LTSize->getZExtValue() >= CSize->getZExtValue())
1394           return true;
1395       }
1396
1397       // If the lifetime.start covers a whole alloca (as it almost always
1398       // does) and we're querying a pointer based on that alloca, then we know
1399       // the memory is definitely undef, regardless of how exactly we alias.
1400       // The size also doesn't matter, as an out-of-bounds access would be UB.
1401       if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
1402         if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
1403           const DataLayout &DL = Alloca->getDataLayout();
1404           if (std::optional<TypeSize> AllocaSize =
1405                   Alloca->getAllocationSize(DL))
1406             if (*AllocaSize == LTSize->getValue())
1407               return true;
1408         }
1409       }
1410     }
1411   }
1412
1413   return false;
1414 }
1415
1416 /// Transform memcpy to memset when its source was just memset.
1417 /// In other words, turn:
1418 /// \code
1419 ///   memset(dst1, c, dst1_size);
1420 ///   memcpy(dst2, dst1, dst2_size);
1421 /// \endcode
1422 /// into:
1423 /// \code
1424 ///   memset(dst1, c, dst1_size);
1425 ///   memset(dst2, c, dst2_size);
1426 /// \endcode
1427 /// When dst2_size <= dst1_size.
1428 bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
1429                                                MemSetInst *MemSet,
1430                                                BatchAAResults &BAA) {
1431   // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
1432   // memcpying from the same address. Otherwise it is hard to reason about.
1433   if (!BAA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
1434     return false;
1435
1436   Value *MemSetSize = MemSet->getLength();
1437   Value *CopySize = MemCpy->getLength();
1438
1439   if (MemSetSize != CopySize) {
1440     // Make sure the memcpy doesn't read any more than what the memset wrote.
1441     // Don't worry about sizes larger than i64.
1442
1443     // A known memset size is required.
1444     auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
1445     if (!CMemSetSize)
1446       return false;
1447
1448     // A known memcpy size is also required.
1449     auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
1450     if (!CCopySize)
1451       return false;
1452     if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
1453       // If the memcpy is larger than the memset, but the memory was undef prior
1454       // to the memset, we can just ignore the tail. Technically we're only
1455       // interested in the bytes from MemSetSize..CopySize here, but as we can't
1456       // easily represent this location, we use the full 0..CopySize range.
1457       MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
1458       bool CanReduceSize = false;
1459       MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
1460       MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
1461           MemSetAccess->getDefiningAccess(), MemCpyLoc, BAA);
1462       if (auto *MD = dyn_cast<MemoryDef>(Clobber))
1463         if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
1464           CanReduceSize = true;
1465
1466       if (!CanReduceSize)
1467         return false;
1468       CopySize = MemSetSize;
1469     }
1470   }
1471
1472   IRBuilder<> Builder(MemCpy);
1473   Instruction *NewM =
1474       Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
1475                            CopySize, MemCpy->getDestAlign());
1476   auto *LastDef = cast<MemoryDef>(MSSA->getMemoryAccess(MemCpy));
1477   auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, nullptr, LastDef);
1478   MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
1479
1480   return true;
1481 }
1482
1483 // Attempts to optimize the pattern whereby memory is copied from an alloca to
1484 // another alloca, where the two allocas don't have conflicting mod/ref. If
1485 // successful, the two allocas can be merged into one and the transfer can be
1486 // deleted. This pattern is generated frequently in Rust, due to the ubiquity of
1487 // move operations in that language.
1488 //
1489 // Once we determine that the optimization is safe to perform, we replace all
1490 // uses of the destination alloca with the source alloca. We also "shrink wrap"
1491 // the lifetime markers of the single merged alloca to before the first use
1492 // and after the last use. Note that the "shrink wrapping" procedure is a safe
1493 // transformation only because we restrict the scope of this optimization to
1494 // allocas that aren't captured.
1495 bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store,
1496                                           AllocaInst *DestAlloca,
1497                                           AllocaInst *SrcAlloca, TypeSize Size,
1498                                           BatchAAResults &BAA) {
1499   LLVM_DEBUG(dbgs() << "Stack Move: Attempting to optimize:\n"
1500                     << *Store << "\n");
1501
1502   // Make sure the two allocas are in the same address space.
1503   if (SrcAlloca->getAddressSpace() != DestAlloca->getAddressSpace()) {
1504     LLVM_DEBUG(dbgs() << "Stack Move: Address space mismatch\n");
1505     return false;
1506   }
1507
1508   // Check that copy is full with static size.
1509   const DataLayout &DL = DestAlloca->getDataLayout();
1510   std::optional<TypeSize> SrcSize = SrcAlloca->getAllocationSize(DL);
1511   if (!SrcSize || Size != *SrcSize) {
1512     LLVM_DEBUG(dbgs() << "Stack Move: Source alloca size mismatch\n");
1513     return false;
1514   }
1515   std::optional<TypeSize> DestSize = DestAlloca->getAllocationSize(DL);
1516   if (!DestSize || Size != *DestSize) {
1517     LLVM_DEBUG(dbgs() << "Stack Move: Destination alloca size mismatch\n");
1518     return false;
1519   }
1520
1521   if (!SrcAlloca->isStaticAlloca() || !DestAlloca->isStaticAlloca())
1522     return false;
1523
1524   // Check that src and dest are never captured, unescaped allocas. Also
1525   // find the nearest common dominator and postdominator for all users in
1526   // order to shrink wrap the lifetimes, and instructions with noalias metadata
1527   // to remove them.
1528
1529   SmallVector<Instruction *, 4> LifetimeMarkers;
1530   SmallSet<Instruction *, 4> NoAliasInstrs;
1531   bool SrcNotDom = false;
1532
1533   // Recursively track the user and check whether modified alias exist.
1534   auto IsDereferenceableOrNull = [](Value *V, const DataLayout &DL) -> bool {
1535     bool CanBeNull, CanBeFreed;
1536     return V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
1537   };
1538
1539   auto CaptureTrackingWithModRef =
1540       [&](Instruction *AI,
1541           function_ref<bool(Instruction *)> ModRefCallback) -> bool {
1542     SmallVector<Instruction *, 8> Worklist;
1543     Worklist.push_back(AI);
1544     unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking();
1545     Worklist.reserve(MaxUsesToExplore);
1546     SmallSet<const Use *, 20> Visited;
1547     while (!Worklist.empty()) {
1548       Instruction *I = Worklist.back();
1549       Worklist.pop_back();
1550       for (const Use &U : I->uses()) {
1551         auto *UI = cast<Instruction>(U.getUser());
1552         // If any use that isn't dominated by SrcAlloca exists, we move src
1553         // alloca to the entry before the transformation.
1554         if (!DT->dominates(SrcAlloca, UI))
1555           SrcNotDom = true;
1556
1557         if (Visited.size() >= MaxUsesToExplore) {
1558           LLVM_DEBUG(
1559               dbgs()
1560               << "Stack Move: Exceeded max uses to see ModRef, bailing\n");
1561           return false;
1562         }
1563         if (!Visited.insert(&U).second)
1564           continue;
1565         switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) {
1566         case UseCaptureKind::MAY_CAPTURE:
1567           return false;
1568         case UseCaptureKind::PASSTHROUGH:
1569           // Instructions cannot have non-instruction users.
1570           Worklist.push_back(UI);
1571           continue;
1572         case UseCaptureKind::NO_CAPTURE: {
1573           if (UI->isLifetimeStartOrEnd()) {
1574             // We note the locations of these intrinsic calls so that we can
1575             // delete them later if the optimization succeeds, this is safe
1576             // since both llvm.lifetime.start and llvm.lifetime.end intrinsics
1577             // practically fill all the bytes of the alloca with an undefined
1578             // value, although conceptually marked as alive/dead.
1579             int64_t Size = cast<ConstantInt>(UI->getOperand(0))->getSExtValue();
1580             if (Size < 0 || Size == DestSize) {
1581               LifetimeMarkers.push_back(UI);
1582               continue;
1583             }
1584           }
1585           if (UI->hasMetadata(LLVMContext::MD_noalias))
1586             NoAliasInstrs.insert(UI);
1587           if (!ModRefCallback(UI))
1588             return false;
1589         }
1590         }
1591       }
1592     }
1593     return true;
1594   };
1595
1596   // Check that dest has no Mod/Ref, from the alloca to the Store, except full
1597   // size lifetime intrinsics. And collect modref inst for the reachability
1598   // check.
1599   ModRefInfo DestModRef = ModRefInfo::NoModRef;
1600   MemoryLocation DestLoc(DestAlloca, LocationSize::precise(Size));
1601   SmallVector<BasicBlock *, 8> ReachabilityWorklist;
1602   auto DestModRefCallback = [&](Instruction *UI) -> bool {
1603     // We don't care about the store itself.
1604     if (UI == Store)
1605       return true;
1606     ModRefInfo Res = BAA.getModRefInfo(UI, DestLoc);
1607     DestModRef |= Res;
1608     if (isModOrRefSet(Res)) {
1609       // Instructions reachability checks.
1610       // FIXME: adding the Instruction version isPotentiallyReachableFromMany on
1611       // lib/Analysis/CFG.cpp (currently only for BasicBlocks) might be helpful.
1612       if (UI->getParent() == Store->getParent()) {
1613         // The same block case is special because it's the only time we're
1614         // looking within a single block to see which instruction comes first.
1615         // Once we start looking at multiple blocks, the first instruction of
1616         // the block is reachable, so we only need to determine reachability
1617         // between whole blocks.
1618         BasicBlock *BB = UI->getParent();
1619
1620         // If A comes before B, then B is definitively reachable from A.
1621         if (UI->comesBefore(Store))
1622           return false;
1623
1624         // If the user's parent block is entry, no predecessor exists.
1625         if (BB->isEntryBlock())
1626           return true;
1627
1628         // Otherwise, continue doing the normal per-BB CFG walk.
1629         ReachabilityWorklist.append(succ_begin(BB), succ_end(BB));
1630       } else {
1631         ReachabilityWorklist.push_back(UI->getParent());
1632       }
1633     }
1634     return true;
1635   };
1636
1637   if (!CaptureTrackingWithModRef(DestAlloca, DestModRefCallback))
1638     return false;
1639   // Bailout if Dest may have any ModRef before Store.
1640   if (!ReachabilityWorklist.empty() &&
1641       isPotentiallyReachableFromMany(ReachabilityWorklist, Store->getParent(),
1642                                      nullptr, DT, nullptr))
1643     return false;
1644
1645   // Check that, from after the Load to the end of the BB,
1646   //   - if the dest has any Mod, src has no Ref, and
1647   //   - if the dest has any Ref, src has no Mod except full-sized lifetimes.
1648   MemoryLocation SrcLoc(SrcAlloca, LocationSize::precise(Size));
1649
1650   auto SrcModRefCallback = [&](Instruction *UI) -> bool {
1651     // Any ModRef post-dominated by Load doesn't matter, also Load and Store
1652     // themselves can be ignored.
1653     if (PDT->dominates(Load, UI) || UI == Load || UI == Store)
1654       return true;
1655     ModRefInfo Res = BAA.getModRefInfo(UI, SrcLoc);
1656     if ((isModSet(DestModRef) && isRefSet(Res)) ||
1657         (isRefSet(DestModRef) && isModSet(Res)))
1658       return false;
1659
1660     return true;
1661   };
1662
1663   if (!CaptureTrackingWithModRef(SrcAlloca, SrcModRefCallback))
1664     return false;
1665
1666   // We can do the transformation. First, move the SrcAlloca to the start of the
1667   // BB.
1668   if (SrcNotDom)
1669     SrcAlloca->moveBefore(*SrcAlloca->getParent(),
1670                           SrcAlloca->getParent()->getFirstInsertionPt());
1671   // Align the allocas appropriately.
1672   SrcAlloca->setAlignment(
1673       std::max(SrcAlloca->getAlign(), DestAlloca->getAlign()));
1674
1675   // Merge the two allocas.
1676   DestAlloca->replaceAllUsesWith(SrcAlloca);
1677   eraseInstruction(DestAlloca);
1678
1679   // Drop metadata on the source alloca.
1680   SrcAlloca->dropUnknownNonDebugMetadata();
1681
1682   // TODO: Reconstruct merged lifetime markers.
1683   // Remove all other lifetime markers. if the original lifetime intrinsics
1684   // exists.
1685   if (!LifetimeMarkers.empty()) {
1686     for (Instruction *I : LifetimeMarkers)
1687       eraseInstruction(I);
1688   }
1689
1690   // As this transformation can cause memory accesses that didn't previously
1691   // alias to begin to alias one another, we remove !noalias metadata from any
1692   // uses of either alloca. This is conservative, but more precision doesn't
1693   // seem worthwhile right now.
1694   for (Instruction *I : NoAliasInstrs)
1695     I->setMetadata(LLVMContext::MD_noalias, nullptr);
1696
1697   LLVM_DEBUG(dbgs() << "Stack Move: Performed staack-move optimization\n");
1698   NumStackMove++;
1699   return true;
1700 }
1701
1702 static bool isZeroSize(Value *Size) {
1703   if (auto *I = dyn_cast<Instruction>(Size))
1704     if (auto *Res = simplifyInstruction(I, I->getDataLayout()))
1705       Size = Res;
1706   // Treat undef/poison size like zero.
1707   if (auto *C = dyn_cast<Constant>(Size))
1708     return isa<UndefValue>(C) || C->isNullValue();
1709   return false;
1710 }
1711
1712 /// Perform simplification of memcpy's.  If we have memcpy A
1713 /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
1714 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
1715 /// circumstances). This allows later passes to remove the first memcpy
1716 /// altogether.
1717 bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
1718   // We can only optimize non-volatile memcpy's.
1719   if (M->isVolatile())
1720     return false;
1721
1722   // If the source and destination of the memcpy are the same, then zap it.
1723   if (M->getSource() == M->getDest()) {
1724     ++BBI;
1725     eraseInstruction(M);
1726     return true;
1727   }
1728
1729   // If the size is zero, remove the memcpy.
1730   if (isZeroSize(M->getLength())) {
1731     ++BBI;
1732     eraseInstruction(M);
1733     return true;
1734   }
1735
1736   MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
1737   if (!MA)
1738     // Degenerate case: memcpy marked as not accessing memory.
1739     return false;
1740
1741   // If copying from a constant, try to turn the memcpy into a memset.
1742   if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
1743     if (GV->isConstant() && GV->hasDefinitiveInitializer())
1744       if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
1745                                            M->getDataLayout())) {
1746         IRBuilder<> Builder(M);
1747         Instruction *NewM = Builder.CreateMemSet(
1748             M->getRawDest(), ByteVal, M->getLength(), M->getDestAlign(), false);
1749         auto *LastDef = cast<MemoryDef>(MA);
1750         auto *NewAccess =
1751             MSSAU->createMemoryAccessAfter(NewM, nullptr, LastDef);
1752         MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
1753
1754         eraseInstruction(M);
1755         ++NumCpyToSet;
1756         return true;
1757       }
1758
1759   BatchAAResults BAA(*AA, EEA);
1760   // FIXME: Not using getClobberingMemoryAccess() here due to PR54682.
1761   MemoryAccess *AnyClobber = MA->getDefiningAccess();
1762   MemoryLocation DestLoc = MemoryLocation::getForDest(M);
1763   const MemoryAccess *DestClobber =
1764       MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc, BAA);
1765
1766   // Try to turn a partially redundant memset + memcpy into
1767   // smaller memset + memcpy.  We don't need the memcpy size for this.
1768   // The memcpy must post-dom the memset, so limit this to the same basic
1769   // block. A non-local generalization is likely not worthwhile.
1770   if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
1771     if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
1772       if (DestClobber->getBlock() == M->getParent())
1773         if (processMemSetMemCpyDependence(M, MDep, BAA))
1774           return true;
1775
1776   MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
1777       AnyClobber, MemoryLocation::getForSource(M), BAA);
1778
1779   // There are five possible optimizations we can do for memcpy:
1780   //   a) memcpy-memcpy xform which exposes redundance for DSE.
1781   //   b) call-memcpy xform for return slot optimization.
1782   //   c) memcpy from freshly alloca'd space or space that has just started
1783   //      its lifetime copies undefined data, and we can therefore eliminate
1784   //      the memcpy in favor of the data that was already at the destination.
1785   //   d) memcpy from a just-memset'd source can be turned into memset.
1786   //   e) elimination of memcpy via stack-move optimization.
1787   if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
1788     if (Instruction *MI = MD->getMemoryInst()) {
1789       if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
1790         if (auto *C = dyn_cast<CallInst>(MI)) {
1791           if (performCallSlotOptzn(M, M, M->getDest(), M->getSource(),
1792                                    TypeSize::getFixed(CopySize->getZExtValue()),
1793                                    M->getDestAlign().valueOrOne(), BAA,
1794                                    [C]() -> CallInst * { return C; })) {
1795             LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
1796                               << "    call: " << *C << "\n"
1797                               << "    memcpy: " << *M << "\n");
1798             eraseInstruction(M);
1799             ++NumMemCpyInstr;
1800             return true;
1801           }
1802         }
1803       }
1804       if (auto *MDep = dyn_cast<MemCpyInst>(MI))
1805         if (processMemCpyMemCpyDependence(M, MDep, BAA))
1806           return true;
1807       if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
1808         if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
1809           LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
1810           eraseInstruction(M);
1811           ++NumCpyToSet;
1812           return true;
1813         }
1814       }
1815     }
1816
1817     if (hasUndefContents(MSSA, BAA, M->getSource(), MD, M->getLength())) {
1818       LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
1819       eraseInstruction(M);
1820       ++NumMemCpyInstr;
1821       return true;
1822     }
1823   }
1824
1825   // If the transfer is from a stack slot to a stack slot, then we may be able
1826   // to perform the stack-move optimization. See the comments in
1827   // performStackMoveOptzn() for more details.
1828   auto *DestAlloca = dyn_cast<AllocaInst>(M->getDest());
1829   if (!DestAlloca)
1830     return false;
1831   auto *SrcAlloca = dyn_cast<AllocaInst>(M->getSource());
1832   if (!SrcAlloca)
1833     return false;
1834   ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength());
1835   if (Len == nullptr)
1836     return false;
1837   if (performStackMoveOptzn(M, M, DestAlloca, SrcAlloca,
1838                             TypeSize::getFixed(Len->getZExtValue()), BAA)) {
1839     // Avoid invalidating the iterator.
1840     BBI = M->getNextNonDebugInstruction()->getIterator();
1841     eraseInstruction(M);
1842     ++NumMemCpyInstr;
1843     return true;
1844   }
1845
1846   return false;
1847 }
1848
1849 /// Memmove calls with overlapping src/dest buffers that come after a memset may
1850 /// be removed.
1851 bool MemCpyOptPass::isMemMoveMemSetDependency(MemMoveInst *M) {
1852   const auto &DL = M->getDataLayout();
1853   MemoryUseOrDef *MemMoveAccess = MSSA->getMemoryAccess(M);
1854   if (!MemMoveAccess)
1855     return false;
1856
1857   // The memmove is of form memmove(x, x + A, B).
1858   MemoryLocation SourceLoc = MemoryLocation::getForSource(M);
1859   auto *MemMoveSourceOp = M->getSource();
1860   auto *Source = dyn_cast<GEPOperator>(MemMoveSourceOp);
1861   if (!Source)
1862     return false;
1863
1864   APInt Offset(DL.getIndexTypeSizeInBits(Source->getType()), 0);
1865   LocationSize MemMoveLocSize = SourceLoc.Size;
1866   if (Source->getPointerOperand() != M->getDest() ||
1867       !MemMoveLocSize.hasValue() ||
1868       !Source->accumulateConstantOffset(DL, Offset) || Offset.isNegative()) {
1869     return false;
1870   }
1871
1872   uint64_t MemMoveSize = MemMoveLocSize.getValue();
1873   LocationSize TotalSize =
1874       LocationSize::precise(Offset.getZExtValue() + MemMoveSize);
1875   MemoryLocation CombinedLoc(M->getDest(), TotalSize);
1876
1877   // The first dominating clobbering MemoryAccess for the combined location
1878   // needs to be a memset.
1879   BatchAAResults BAA(*AA);
1880   MemoryAccess *FirstDef = MemMoveAccess->getDefiningAccess();
1881   auto *DestClobber = dyn_cast<MemoryDef>(
1882       MSSA->getWalker()->getClobberingMemoryAccess(FirstDef, CombinedLoc, BAA));
1883   if (!DestClobber)
1884     return false;
1885
1886   auto *MS = dyn_cast_or_null<MemSetInst>(DestClobber->getMemoryInst());
1887   if (!MS)
1888     return false;
1889
1890   // Memset length must be sufficiently large.
1891   auto *MemSetLength = dyn_cast<ConstantInt>(MS->getLength());
1892   if (!MemSetLength || MemSetLength->getZExtValue() < MemMoveSize)
1893     return false;
1894
1895   // The destination buffer must have been memset'd.
1896   if (!BAA.isMustAlias(MS->getDest(), M->getDest()))
1897     return false;
1898
1899   return true;
1900 }
1901
1902 /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
1903 /// not to alias.
1904 bool MemCpyOptPass::processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI) {
1905   // See if the source could be modified by this memmove potentially.
1906   if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M)))) {
1907     // On the off-chance the memmove clobbers src with previously memset'd
1908     // bytes, the memmove may be redundant.
1909     if (!M->isVolatile() && isMemMoveMemSetDependency(M)) {
1910       LLVM_DEBUG(dbgs() << "Removed redundant memmove.\n");
1911       ++BBI;
1912       eraseInstruction(M);
1913       ++NumMemMoveInstr;
1914       return true;
1915     }
1916     return false;
1917   }
1918
1919   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
1920                     << "\n");
1921
1922   // If not, then we know we can transform this.
1923   Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(),
1924                      M->getLength()->getType()};
1925   M->setCalledFunction(Intrinsic::getOrInsertDeclaration(
1926       M->getModule(), Intrinsic::memcpy, ArgTys));
1927
1928   // For MemorySSA nothing really changes (except that memcpy may imply stricter
1929   // aliasing guarantees).
1930
1931   ++NumMoveToCpy;
1932   return true;
1933 }
1934
1935 /// This is called on every byval argument in call sites.
1936 bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
1937   const DataLayout &DL = CB.getDataLayout();
1938   // Find out what feeds this byval argument.
1939   Value *ByValArg = CB.getArgOperand(ArgNo);
1940   Type *ByValTy = CB.getParamByValType(ArgNo);
1941   TypeSize ByValSize = DL.getTypeAllocSize(ByValTy);
1942   MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
1943   MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
1944   if (!CallAccess)
1945     return false;
1946   MemCpyInst *MDep = nullptr;
1947   BatchAAResults BAA(*AA, EEA);
1948   MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
1949       CallAccess->getDefiningAccess(), Loc, BAA);
1950   if (auto *MD = dyn_cast<MemoryDef>(Clobber))
1951     MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
1952
1953   // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
1954   // a memcpy, see if we can byval from the source of the memcpy instead of the
1955   // result.
1956   if (!MDep || MDep->isVolatile() ||
1957       ByValArg->stripPointerCasts() != MDep->getDest())
1958     return false;
1959
1960   // The length of the memcpy must be larger or equal to the size of the byval.
1961   auto *C1 = dyn_cast<ConstantInt>(MDep->getLength());
1962   if (!C1 || !TypeSize::isKnownGE(
1963                  TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize))
1964     return false;
1965
1966   // Get the alignment of the byval.  If the call doesn't specify the alignment,
1967   // then it is some target specific value that we can't know.
1968   MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
1969   if (!ByValAlign)
1970     return false;
1971
1972   // If it is greater than the memcpy, then we check to see if we can force the
1973   // source of the memcpy to the alignment we need.  If we fail, we bail out.
1974   MaybeAlign MemDepAlign = MDep->getSourceAlign();
1975   if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
1976       getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, AC,
1977                                  DT) < *ByValAlign)
1978     return false;
1979
1980   // The type of the memcpy source must match the byval argument
1981   if (MDep->getSource()->getType() != ByValArg->getType())
1982     return false;
1983
1984   // Verify that the copied-from memory doesn't change in between the memcpy and
1985   // the byval call.
1986   //    memcpy(a <- b)
1987   //    *b = 42;
1988   //    foo(*a)
1989   // It would be invalid to transform the second memcpy into foo(*b).
1990   if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
1991                      MSSA->getMemoryAccess(MDep), CallAccess))
1992     return false;
1993
1994   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
1995                     << "  " << *MDep << "\n"
1996                     << "  " << CB << "\n");
1997
1998   // Otherwise we're good!  Update the byval argument.
1999   combineAAMetadata(&CB, MDep);
2000   CB.setArgOperand(ArgNo, MDep->getSource());
2001   ++NumMemCpyInstr;
2002   return true;
2003 }
2004
2005 /// This is called on memcpy dest pointer arguments attributed as immutable
2006 /// during call. Try to use memcpy source directly if all of the following
2007 /// conditions are satisfied.
2008 /// 1. The memcpy dst is neither modified during the call nor captured by the
2009 /// call.
2010 /// 2. The memcpy dst is an alloca with known alignment & size.
2011 ///     2-1. The memcpy length == the alloca size which ensures that the new
2012 ///     pointer is dereferenceable for the required range
2013 ///     2-2. The src pointer has alignment >= the alloca alignment or can be
2014 ///     enforced so.
2015 /// 3. The memcpy dst and src is not modified between the memcpy and the call.
2016 /// (if MSSA clobber check is safe.)
2017 /// 4. The memcpy src is not modified during the call. (ModRef check shows no
2018 /// Mod.)
2019 bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
2020   BatchAAResults BAA(*AA, EEA);
2021   Value *ImmutArg = CB.getArgOperand(ArgNo);
2022
2023   // 1. Ensure passed argument is immutable during call.
2024   if (!CB.paramHasAttr(ArgNo, Attribute::NoCapture))
2025     return false;
2026
2027   // We know that the argument is readonly at this point, but the function
2028   // might still modify the same memory through a different pointer. Exclude
2029   // this either via noalias, or alias analysis.
2030   if (!CB.paramHasAttr(ArgNo, Attribute::NoAlias) &&
2031       isModSet(
2032           BAA.getModRefInfo(&CB, MemoryLocation::getBeforeOrAfter(ImmutArg))))
2033     return false;
2034
2035   const DataLayout &DL = CB.getDataLayout();
2036
2037   // 2. Check that arg is alloca
2038   // TODO: Even if the arg gets back to branches, we can remove memcpy if all
2039   // the alloca alignments can be enforced to source alignment.
2040   auto *AI = dyn_cast<AllocaInst>(ImmutArg->stripPointerCasts());
2041   if (!AI)
2042     return false;
2043
2044   std::optional<TypeSize> AllocaSize = AI->getAllocationSize(DL);
2045   // Can't handle unknown size alloca.
2046   // (e.g. Variable Length Array, Scalable Vector)
2047   if (!AllocaSize || AllocaSize->isScalable())
2048     return false;
2049   MemoryLocation Loc(ImmutArg, LocationSize::precise(*AllocaSize));
2050   MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
2051   if (!CallAccess)
2052     return false;
2053
2054   MemCpyInst *MDep = nullptr;
2055   MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
2056       CallAccess->getDefiningAccess(), Loc, BAA);
2057   if (auto *MD = dyn_cast<MemoryDef>(Clobber))
2058     MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
2059
2060   // If the immut argument isn't fed by a memcpy, ignore it.  If it is fed by
2061   // a memcpy, check that the arg equals the memcpy dest.
2062   if (!MDep || MDep->isVolatile() || AI != MDep->getDest())
2063     return false;
2064
2065   // The type of the memcpy source must match the immut argument
2066   if (MDep->getSource()->getType() != ImmutArg->getType())
2067     return false;
2068
2069   // 2-1. The length of the memcpy must be equal to the size of the alloca.
2070   auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
2071   if (!MDepLen || AllocaSize != MDepLen->getValue())
2072     return false;
2073
2074   // 2-2. the memcpy source align must be larger than or equal the alloca's
2075   // align. If not so, we check to see if we can force the source of the memcpy
2076   // to the alignment we need. If we fail, we bail out.
2077   Align MemDepAlign = MDep->getSourceAlign().valueOrOne();
2078   Align AllocaAlign = AI->getAlign();
2079   if (MemDepAlign < AllocaAlign &&
2080       getOrEnforceKnownAlignment(MDep->getSource(), AllocaAlign, DL, &CB, AC,
2081                                  DT) < AllocaAlign)
2082     return false;
2083
2084   // 3. Verify that the source doesn't change in between the memcpy and
2085   // the call.
2086   //    memcpy(a <- b)
2087   //    *b = 42;
2088   //    foo(*a)
2089   // It would be invalid to transform the second memcpy into foo(*b).
2090   if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep),
2091                      MSSA->getMemoryAccess(MDep), CallAccess))
2092     return false;
2093
2094   // 4. The memcpy src must not be modified during the call.
2095   if (isModSet(BAA.getModRefInfo(&CB, MemoryLocation::getForSource(MDep))))
2096     return false;
2097
2098   LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to Immut src:\n"
2099                     << "  " << *MDep << "\n"
2100                     << "  " << CB << "\n");
2101
2102   // Otherwise we're good!  Update the immut argument.
2103   combineAAMetadata(&CB, MDep);
2104   CB.setArgOperand(ArgNo, MDep->getSource());
2105   ++NumMemCpyInstr;
2106   return true;
2107 }
2108
2109 /// Executes one iteration of MemCpyOptPass.
2110 bool MemCpyOptPass::iterateOnFunction(Function &F) {
2111   bool MadeChange = false;
2112
2113   // Walk all instruction in the function.
2114   for (BasicBlock &BB : F) {
2115     // Skip unreachable blocks. For example processStore assumes that an
2116     // instruction in a BB can't be dominated by a later instruction in the
2117     // same BB (which is a scenario that can happen for an unreachable BB that
2118     // has itself as a predecessor).
2119     if (!DT->isReachableFromEntry(&BB))
2120       continue;
2121
2122     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
2123       // Avoid invalidating the iterator.
2124       Instruction *I = &*BI++;
2125
2126       bool RepeatInstruction = false;
2127
2128       if (auto *SI = dyn_cast<StoreInst>(I))
2129         MadeChange |= processStore(SI, BI);
2130       else if (auto *M = dyn_cast<MemSetInst>(I))
2131         RepeatInstruction = processMemSet(M, BI);
2132       else if (auto *M = dyn_cast<MemCpyInst>(I))
2133         RepeatInstruction = processMemCpy(M, BI);
2134       else if (auto *M = dyn_cast<MemMoveInst>(I))
2135         RepeatInstruction = processMemMove(M, BI);
2136       else if (auto *CB = dyn_cast<CallBase>(I)) {
2137         for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) {
2138           if (CB->isByValArgument(i))
2139             MadeChange |= processByValArgument(*CB, i);
2140           else if (CB->onlyReadsMemory(i))
2141             MadeChange |= processImmutArgument(*CB, i);
2142         }
2143       }
2144
2145       // Reprocess the instruction if desired.
2146       if (RepeatInstruction) {
2147         if (BI != BB.begin())
2148           --BI;
2149         MadeChange = true;
2150       }
2151     }
2152   }
2153
2154   return MadeChange;
2155 }
2156
2157 PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
2158   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
2159   auto *AA = &AM.getResult<AAManager>(F);
2160   auto *AC = &AM.getResult<AssumptionAnalysis>(F);
2161   auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
2162   auto *PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
2163   auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
2164
2165   bool MadeChange = runImpl(F, &TLI, AA, AC, DT, PDT, &MSSA->getMSSA());
2166   if (!MadeChange)
2167     return PreservedAnalyses::all();
2168
2169   PreservedAnalyses PA;
2170   PA.preserveSet<CFGAnalyses>();
2171   PA.preserve<MemorySSAAnalysis>();
2172   return PA;
2173 }
2174
2175 bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
2176                             AliasAnalysis *AA_, AssumptionCache *AC_,
2177                             DominatorTree *DT_, PostDominatorTree *PDT_,
2178                             MemorySSA *MSSA_) {
2179   bool MadeChange = false;
2180   TLI = TLI_;
2181   AA = AA_;
2182   AC = AC_;
2183   DT = DT_;
2184   PDT = PDT_;
2185   MSSA = MSSA_;
2186   MemorySSAUpdater MSSAU_(MSSA_);
2187   MSSAU = &MSSAU_;
2188   EarliestEscapeAnalysis EEA_(*DT);
2189   EEA = &EEA_;
2190
2191   while (true) {
2192     if (!iterateOnFunction(F))
2193       break;
2194     MadeChange = true;
2195   }
2196
2197   if (VerifyMemorySSA)
2198     MSSA_->verifyMemorySSA();
2199
2200   return MadeChange;
2201 }