//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
//  - Break up each basic block into pseudo-BBs, composed of instructions which
//    are guaranteed to transfer control to their successors.
//  - Within a single pseudo-BB, find all loads, and group them into
//    "equivalence classes" according to getUnderlyingObject() and loaded
//    element size.  Do the same for stores.
//  - For each equivalence class, greedily build "chains".  Each chain has a
//    leader instruction, and every other member of the chain has a known
//    constant offset from the first instr in the chain.
//  - Break up chains so that they contain only contiguous accesses of legal
//    size with no intervening may-alias instrs.
//  - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before.  To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.
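//
// Illustrative example (not from the original source; a hypothetical sketch of
// the transformation).  Given four adjacent i32 loads in one BB:
//
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   %p2 = getelementptr inbounds i32, ptr %p, i64 2
//   %p3 = getelementptr inbounds i32, ptr %p, i64 3
//   %v0 = load i32, ptr %p, align 16
//   %v1 = load i32, ptr %p1, align 4
//   %v2 = load i32, ptr %p2, align 8
//   %v3 = load i32, ptr %p3, align 4
//
// the pass would (target permitting) emit one vector load plus element
// extracts, which are free on the GPUs described above:
//
//   %vec = load <4 x i32>, ptr %p, align 16
//   %v0  = extractelement <4 x i32> %vec, i32 0
//   %v1  = extractelement <4 x i32> %vec, i32 1
//   %v2  = extractelement <4 x i32> %vec, i32 2
//   %v3  = extractelement <4 x i32> %vec, i32 3
//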
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {
// Equivalence class key, the initial tuple by which we group loads/stores.
// Loads/stores with different EqClassKeys are never merged.
//
// (We could in theory remove element-size from this tuple.  We'd just need
// to fix up the vector packing/unpacking code.)
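//
// Illustrative example (not from the original source): two i32 loads based on
// the same underlying object in addrspace(1) share the key
//   {getUnderlyingObject(Ptr), /*AddrSpace=*/1, /*ElementSize=*/32, /*IsLoad=*/1}
// and may later be chained together, while an i8 load from the same object
// lands in a different class because its element size differs.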
using EqClassKey =
    std::tuple<const Value * /* result of getUnderlyingObject() */,
               unsigned /* AddrSpace */,
               unsigned /* Load/Store element size bits */,
               char /* IsLoad; char b/c bool can't be a DenseMap key */
               >;

[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                               const EqClassKey &K) {
  const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
  return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
            << " of element size " << ElementSize << " bits in addrspace "
            << AddrSpace;
}
// A Chain is a set of instructions such that:
//  - All instructions have the same equivalence class, so in particular all are
//    loads, or all are stores.
//  - We know the address accessed by the i'th chain elem relative to the
//    chain's leader instruction, which is the first instr of the chain in BB
//    order.
//
// Chains have two canonical orderings:
//  - BB order, sorted by Instr->comesBefore.
//  - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
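//
// Illustrative example (not from the original source): if the leader is a load
// at address P, a chain might look like
//   {load P, +0}, {load P-4, -4}, {load P+4, +4}    // BB order
// which in offset order becomes
//   {load P-4, -4}, {load P, +0}, {load P+4, +4}.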
struct ChainElem {
  Instruction *Inst;
  APInt OffsetFromLeader;
  ChainElem(Instruction *Inst, APInt OffsetFromLeader)
      : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
};

using Chain = SmallVector<ChainElem, 1>;

void sortChainInBBOrder(Chain &C) {
  sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
}
void sortChainInOffsetOrder(Chain &C) {
  sort(C, [](const auto &A, const auto &B) {
    if (A.OffsetFromLeader != B.OffsetFromLeader)
      return A.OffsetFromLeader.slt(B.OffsetFromLeader);
    return A.Inst->comesBefore(B.Inst); // stable tiebreaker
  });
}
[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
  for (const auto &E : C) {
    dbgs() << "  " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
  }
}
using EquivalenceClassMap =
    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough
constexpr unsigned StackAdjustedAlignment = 4;

Instruction *propagateMetadata(Instruction *I, const Chain &C) {
  SmallVector<Value *, 8> Values;
  for (const ChainElem &E : C)
    Values.emplace_back(E.Inst);
  return propagateMetadata(I, Values);
}
bool isInvariantLoad(const Instruction *I) {
  const LoadInst *LI = dyn_cast<LoadInst>(I);
  return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
}
/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.emplace_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int Idx = 0; Idx < NumOperands; Idx++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      assert(IM != I && "Unexpected cycle while re-ordering instructions");

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.emplace_back(IM);
      }
    }
  }

  // All instructions to move should follow I.  Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
    Instruction *IM = &*(BBI++);
    if (!InstructionsToMove.contains(IM))
      continue;
    IM->moveBefore(I);
  }
}
class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

  // We could erase instrs right after vectorizing them, but that can mess up
  // our BB iterators, and also can make the equivalence class keys point to
  // freed memory.  This is fixable, but it's simpler just to wait until we're
  // done with the BB and erase all at once.
  SmallVector<Instruction *, 128> ToErase;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  static const unsigned MaxDepth = 3;

  /// Runs the vectorizer on a "pseudo basic block", which is a range of
  /// instructions [Begin, End) within one BB all of which have
  /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
  bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);

  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
  /// in the same BB with the same value for getUnderlyingObject() etc.
  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
                             ArrayRef<Instruction *> EqClass);

  /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
  /// where all instructions access a known, constant offset from the first
  /// instruction in the chain.
  bool runOnChain(Chain &C);

  /// Splits the chain into subchains of instructions which read/write a
  /// contiguous block of memory.  Discards any length-1 subchains (because
  /// there's nothing to vectorize in there).
  std::vector<Chain> splitChainByContiguity(Chain &C);

  /// Splits the chain into subchains where it's safe to hoist loads up to the
  /// beginning of the sub-chain and it's safe to sink loads up to the end of
  /// the sub-chain.  Discards any length-1 subchains.
  std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

  /// Splits the chain into subchains that make legal, aligned accesses.
  /// Discards any length-1 subchains.
  std::vector<Chain> splitChainByAlignment(Chain &C);

  /// Converts the instrs in the chain into a single vectorized load or store.
  /// Adds the old scalar loads/stores to ToErase.
  bool vectorizeChain(Chain &C);

  /// Tries to compute the offset in bytes PtrB - PtrA.
  std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
                                         Instruction *ContextInst,
                                         unsigned Depth = 0);
  std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
                                                     Instruction *ContextInst,
                                                     unsigned Depth);
  std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
                                                Instruction *ContextInst,
                                                unsigned Depth);

  /// Gets the element type of the vector that the chain will load or store.
  /// This is nontrivial because the chain may contain elements of different
  /// types; e.g. it's legal to have a chain that contains both i32 and float.
  Type *getChainElemTy(const Chain &C);

  /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
  /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
  /// instructions.
  ///
  /// The map ChainElemOffsets must contain all of the elements in
  /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
  /// address.  It's ok if it contains additional entries.
  template <bool IsLoadChain>
  bool isSafeToMove(
      Instruction *ChainElem, Instruction *ChainBegin,
      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

  /// Collects loads and stores grouped by "equivalence class", where:
  ///   - all elements in an eq class are a load or all are a store,
  ///   - they all load/store the same element size (it's OK to have e.g. i8 and
  ///     <4 x i8> in the same class, but not i32 and <4 x i8>), and
  ///   - they all have the same value for getUnderlyingObject().
  EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                BasicBlock::iterator End);

  /// Partitions Instrs into "chains" where every instruction has a known
  /// constant offset from the first instr in the chain.
  ///
  /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
  /// in the chain is the leader, and an instr touches distance 0 from itself.
  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace
char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}
bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}
PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}
bool Vectorizer::run() {
  bool Changed = false;
  // Break up the BB if there are any instrs which aren't guaranteed to transfer
  // execution to their successor.
  //
  // Consider, for example:
  //
  //   def assert_arr_len(int n) { if (n < 2) exit(); }
  //
  //   load arr[0]
  //   call assert_arr_len(arr.length)
  //   load arr[1]
  //
  // Even though assert_arr_len does not read or write any memory, we can't
  // speculate the second load before the call.  More info at
  // https://github.com/llvm/llvm-project/issues/52950.
  for (BasicBlock *BB : post_order(&F)) {
    // BB must at least have a terminator.
    assert(!BB->empty());

    SmallVector<BasicBlock::iterator, 8> Barriers;
    Barriers.emplace_back(BB->begin());
    for (Instruction &I : *BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
        Barriers.emplace_back(I.getIterator());
    Barriers.emplace_back(BB->end());

    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
         ++It)
      Changed |= runOnPseudoBB(*It, *std::next(It));

    for (Instruction *I : ToErase) {
      auto *PtrOperand = getLoadStorePointerOperand(I);
      if (I->use_empty())
        I->eraseFromParent();
      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
    }
    ToErase.clear();
  }

  return Changed;
}
bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                               BasicBlock::iterator End) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
    if (End != Begin->getParent()->end())
      dbgs() << *End;
    else
      dbgs() << "<BB end>";
    dbgs() << "]\n";
  });

  bool Changed = false;
  for (const auto &[EqClassKey, EqClass] :
       collectEquivalenceClasses(Begin, End))
    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

  return Changed;
}
bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
                                       ArrayRef<Instruction *> EqClass) {
  bool Changed = false;

  LLVM_DEBUG({
    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
           << " keyed on " << EqClassKey << ":\n";
    for (Instruction *I : EqClass)
      dbgs() << "  " << *I << "\n";
  });

  std::vector<Chain> Chains = gatherChains(EqClass);
  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
                    << " nontrivial chains.\n";);
  for (Chain &C : Chains)
    Changed |= runOnChain(C);
  return Changed;
}
bool Vectorizer::runOnChain(Chain &C) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  // Split up the chain into increasingly smaller chains, until we can finally
  // vectorize the chains.
  //
  // (Don't be scared by the depth of the loop nest here.  These operations are
  // all at worst O(n lg n) in the number of instructions, and splitting chains
  // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
  bool Changed = false;
  for (auto &C : splitChainByMayAliasInstrs(C))
    for (auto &C : splitChainByContiguity(C))
      for (auto &C : splitChainByAlignment(C))
        Changed |= vectorizeChain(C);
  return Changed;
}
std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
  sortChainInBBOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
    dumpChain(C);
  });

  // We know that elements in the chain with nonoverlapping offsets can't
  // alias, but AA may not be smart enough to figure this out.  Use a map from
  // instruction to offset-from-leader so we can answer these queries
  // ourselves.
  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
  for (const auto &E : C)
    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

  // Loads get hoisted up to the first load in the chain.  Stores get sunk
  // down to the last store in the chain.  Our algorithm for loads is:
  //
  //  - Take the first element of the chain.  This is the start of a new chain.
  //  - Take the next element of `Chain` and check for may-alias instructions
  //    up to the start of NewChain.  If no may-alias instrs, add it to
  //    NewChain.  Otherwise, start a new NewChain.
  //
  // For stores it's the same except in the reverse direction.
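  //
  // Illustrative example (not from the original source): for a load chain
  // {A, B} with an intervening store S that may alias them, hoisting B up to
  // A would move it across S, so B starts a new chain; both resulting chains
  // have length 1 and are discarded.  If S provably cannot alias (e.g. it
  // writes a disjoint offset of the same underlying object), the chain {A, B}
  // survives intact.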
  // We expect IsLoad to be an std::bool_constant.
  auto Impl = [&](auto IsLoad) {
    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
      if constexpr (IsLoad())
        return std::make_pair(C.begin(), C.end());
      else
        return std::make_pair(C.rbegin(), C.rend());
    }(IsLoad);
    assert(ChainBegin != ChainEnd);

    std::vector<Chain> Chains;
    SmallVector<ChainElem, 1> NewChain;
    NewChain.emplace_back(*ChainBegin);
    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
                               ChainOffsets)) {
        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
                          << "\n");
        NewChain.emplace_back(*ChainIt);
      } else {
        LLVM_DEBUG(
            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
        if (NewChain.size() > 1) {
          LLVM_DEBUG({
            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
            dumpChain(NewChain);
          });
          Chains.emplace_back(std::move(NewChain));
        }

        // Start a new chain.
        NewChain = SmallVector<ChainElem, 1>({*ChainIt});
      }
    }
    if (NewChain.size() > 1) {
      LLVM_DEBUG({
        dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
        dumpChain(NewChain);
      });
      Chains.emplace_back(std::move(NewChain));
    }
    return Chains;
  };

  if (isa<LoadInst>(C[0].Inst))
    return Impl(/*IsLoad=*/std::bool_constant<true>());

  assert(isa<StoreInst>(C[0].Inst));
  return Impl(/*IsLoad=*/std::bool_constant<false>());
}
std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByContiguity considering chain:\n";
    dumpChain(C);
  });

  std::vector<Chain> Ret;
  Ret.push_back({C.front()});

  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
    // `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
    auto &CurChain = Ret.back();
    const ChainElem &Prev = CurChain.back();
    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
                              "collectEquivalenceClasses");
    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

    // Add this instruction to the end of the current chain, or start a new one.
    bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
                      << (AreContiguous ? "" : "not ") << "contiguous: "
                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
                      << ") -> " << *It->Inst << " (starts at offset "
                      << It->OffsetFromLeader << ")\n");
    if (AreContiguous)
      CurChain.push_back(*It);
    else
      Ret.push_back({*It});
  }

  // Filter out length-1 chains, these are uninteresting.
  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
  return Ret;
}
Type *Vectorizer::getChainElemTy(const Chain &C) {
  assert(!C.empty());
  // The rules are:
  //  - If there are any pointer types in the chain, use an integer type.
  //  - Prefer an integer type if it appears in the chain.
  //  - Otherwise, use the first type in the chain.
  //
  // The rule about pointer types is a simplification when we merge e.g. a load
  // of a ptr and a double.  There's no direct conversion from a ptr to a
  // double; it requires a ptrtoint followed by a bitcast.
  //
  // It's unclear to me if the other rules have any practical effect, but we do
  // it to match this pass's previous behavior.
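  //
  // Illustrative example (not from the original source): a chain of
  // {load double, load ptr} on a target with 64-bit pointers gets element type
  // i64, is emitted as a <2 x i64> load, and each scalar use is recreated via
  // a bitcast or inttoptr back to its original type.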
  if (any_of(C, [](const ChainElem &E) {
        return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
      }))
    return Type::getIntNTy(
        F.getContext(),
        DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));

  for (const ChainElem &E : C)
    if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
      return T;
  return getLoadStoreType(C[0].Inst)->getScalarType();
}
std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
  // We use a simple greedy algorithm.
  //  - Given a chain of length N, find all prefixes that
  //    (a) are not longer than the max register length, and
  //    (b) are a power of 2.
  //  - Starting from the longest prefix, try to create a vector of that length.
  //  - If one of them works, great.  Repeat the algorithm on any remaining
  //    elements in the chain.
  //  - If none of them work, discard the first element and repeat on a chain
  //    of length N - 1.
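  //
  // Illustrative example (not from the original source): with a 16-byte vector
  // register and a chain of eight contiguous i32 loads, the longest candidate
  // is the first four elements (16 bytes).  If <4 x i32> is legal and fast we
  // emit it and rerun on the remaining four elements; otherwise we try the
  // two-element prefix, and failing that drop the first element and retry.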
  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByAlignment considering chain:\n";
    dumpChain(C);
  });

  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
                             unsigned ChainSizeBytes, VectorType *VecTy) {
    return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
                                                 ChainSizeBytes, VecTy)
                       : TTI.getStoreVectorFactor(VF, LoadStoreSize,
                                                  ChainSizeBytes, VecTy);
  };

  for (const auto &E : C) {
    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
    assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
           "Should have filtered out non-power-of-two elements in "
           "collectEquivalenceClasses.");
  }

  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

  std::vector<Chain> Ret;
  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
    // Find candidate chains of size not greater than the largest vector reg.
    // These chains are over the closed interval [CBegin, CEnd].
    SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
        CandidateChains;
    for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
      APInt Sz = C[CEnd].OffsetFromLeader +
                 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
                 C[CBegin].OffsetFromLeader;
      if (Sz.sgt(VecRegBytes))
        break;
      CandidateChains.emplace_back(CEnd,
                                   static_cast<unsigned>(Sz.getLimitedValue()));
    }

    // Consider the longest chain first.
    for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
         It != End; ++It) {
      auto [CEnd, SizeBytes] = *It;
      LLVM_DEBUG(
          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
                 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");

      Type *VecElemTy = getChainElemTy(C);
      // Note, VecElemTy is a power of 2, but might be less than one byte.  For
      // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
      // VecElemTy would be i4.
      unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);

      // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
      assert((8 * SizeBytes) % VecElemBits == 0);
      unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
      FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
      unsigned VF = 8 * VecRegBytes / VecElemBits;
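      // Illustrative numbers (not from the original source): with a 16-byte
      // vector register (VecRegBytes = 16), a candidate of SizeBytes = 16 and
      // VecElemBits = 32, we get NumVecElems = 4, VecTy = <4 x i32>, and
      // VF = 4.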
      // Check that TTI is happy with this vectorization factor.
      unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
                                          VecElemBits * NumVecElems / 8, VecTy);
      if (TargetVF != VF && TargetVF < NumVecElems) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because TargetVF="
                   << TargetVF << " != VF=" << VF
                   << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
        continue;
      }

      // Is a load/store with this alignment allowed by TTI and at least as fast
      // as an unvectorized load/store?
      //
      // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
      auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
                               &F = F](Align Alignment) {
        if (Alignment.value() % SizeBytes == 0)
          return true;
        unsigned VectorizedSpeed = 0;
        bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
            F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
        if (!AllowsMisaligned) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " is misaligned, and therefore can't be vectorized.\n");
          return false;
        }

        unsigned ElementwiseSpeed = 0;
        (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
                                             Alignment, &ElementwiseSpeed);
        if (VectorizedSpeed < ElementwiseSpeed) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " has relative speed " << VectorizedSpeed
                     << ", which is lower than the elementwise speed of "
                     << ElementwiseSpeed
                     << ".  Therefore this access won't be vectorized.\n");
          return false;
        }
        return true;
      };

      // If we're loading/storing from an alloca, align it if possible.
      //
      // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
      // tells us this is beneficial.  This feels a bit odd, but it matches
      // existing tests.  This isn't *so* bad, because at most we align to 4
      // bytes (current value of StackAdjustedAlignment).
      //
      // FIXME: We will upgrade the alignment of the alloca even if it turns out
      // we can't vectorize for some other reason.
      Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
      bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
                            isa<AllocaInst>(PtrOperand->stripPointerCasts());
      Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
      Align PrefAlign = Align(StackAdjustedAlignment);
      if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
          IsAllowedAndFast(PrefAlign)) {
        Align NewAlign = getOrEnforceKnownAlignment(
            PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
        if (NewAlign >= Alignment) {
          LLVM_DEBUG(dbgs()
                     << "LSV: splitByChain upgrading alloca alignment from "
                     << Alignment.value() << " to " << NewAlign.value()
                     << "\n");
          Alignment = NewAlign;
        }
      }

      if (!IsAllowedAndFast(Alignment)) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because its alignment is not AllowedAndFast: "
                   << Alignment.value() << "\n");
        continue;
      }

      if ((IsLoadChain &&
           !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
          (!IsLoadChain &&
           !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because !isLegalToVectorizeLoad/StoreChain.");
        continue;
      }

      // Hooray, we can vectorize this chain!
      Chain &NewChain = Ret.emplace_back();
      for (unsigned I = CBegin; I <= CEnd; ++I)
        NewChain.emplace_back(C[I]);
      CBegin = CEnd; // Skip over the instructions we've added to the chain.
      break;
    }
  }
  return Ret;
}
bool Vectorizer::vectorizeChain(Chain &C) {
  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  Type *VecElemTy = getChainElemTy(C);
  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned ChainBytes = std::accumulate(
      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
      });
  assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
  // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
  // than 1 byte (e.g. VecTy == <32 x i1>).
  Type *VecTy = FixedVectorType::get(
      VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

  Align Alignment = getLoadStoreAlignment(C[0].Inst);
  // If this is a load/store of an alloca, we might have upgraded the alloca's
  // alignment earlier.  Get the new alignment.
  if (AS == DL.getAllocaAddrSpace()) {
    Alignment = std::max(
        Alignment,
        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
  }

  // All elements of the chain must have the same scalar-type size.
  for (const ChainElem &E : C)
    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
           DL.getTypeStoreSize(VecElemTy));

  Instruction *VecInst;
  if (IsLoadChain) {
    // Loads get hoisted to the location of the first load in the chain.  We may
    // also need to hoist the (transitive) operands of the loads.
    Builder.SetInsertPoint(
        llvm::min_element(C, [](const auto &A, const auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedLoad(VecTy,
                                        getLoadStorePointerOperand(C[0].Inst),
                                        Alignment);

    unsigned VecIdx = 0;
    for (const ChainElem &E : C) {
      Instruction *I = E.Inst;
      Value *V;
      Type *T = getLoadStoreType(I);
      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
        auto Mask = llvm::to_vector<8>(
            llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
        VecIdx += VT->getNumElements();
      } else {
        V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                         I->getName());
        ++VecIdx;
      }
      if (V->getType() != I->getType())
        V = Builder.CreateBitOrPointerCast(V, I->getType());
      I->replaceAllUsesWith(V);
    }

    // Finally, we need to reorder the instrs in the BB so that the (transitive)
    // operands of VecInst appear before it.  To see why, suppose we have
    // vectorized the following code:
    //
    //   ptr1  = gep a, 1
    //   load1 = load i32 ptr1
    //   ptr0  = gep a, 0
    //   load0 = load i32 ptr0
    //
    // We will put the vectorized load at the location of the earliest load in
    // the BB, i.e. load1.  We get:
    //
    //   ptr1  = gep a, 1
    //   loadv = load <2 x i32> ptr0
    //   load0 = extractelement loadv, 0
    //   load1 = extractelement loadv, 1
    //   ptr0  = gep a, 0
    //
    // Notice that loadv uses ptr0, which is defined *after* it!
    reorder(VecInst);
  } else {
    // Stores get sunk to the location of the last store in the chain.
    Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
                             return A.Inst->comesBefore(B.Inst);
                           })->Inst);

    // Build the vector to store.
    Value *Vec = PoisonValue::get(VecTy);
    unsigned VecIdx = 0;
    auto InsertElem = [&](Value *V) {
      if (V->getType() != VecElemTy)
        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
    };
    for (const ChainElem &E : C) {
      auto *I = cast<StoreInst>(E.Inst);
      if (FixedVectorType *VT =
              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
                                                  Builder.getInt32(J)));
        }
      } else {
        InsertElem(I->getValueOperand());
      }
    }

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedStore(
        Vec, getLoadStorePointerOperand(C[0].Inst), Alignment);
  }

  propagateMetadata(VecInst, C);

  for (const ChainElem &E : C)
    ToErase.emplace_back(E.Inst);

  ++NumVectorInstructions;
  NumScalarsVectorized += C.size();
  return true;
}
template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                    << *ChainBegin << ")\n");

  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
  if (ChainElem == ChainBegin)
    return true;

  // Invariant loads can always be reordered; by definition they are not
  // clobbered by stores.
  if (isInvariantLoad(ChainElem))
    return true;

  auto BBIt = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainElem);
    else
      return BasicBlock::iterator(ChainElem);
  }());
  auto BBItEnd = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainBegin);
    else
      return BasicBlock::iterator(ChainBegin);
  }());

  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
  const unsigned ChainElemSize =
      DL.getTypeStoreSize(getLoadStoreType(ChainElem));

  for (; BBIt != BBItEnd; ++BBIt) {
    Instruction *I = &*BBIt;

    if (!I->mayReadOrWriteMemory())
      continue;

    // Loads can be reordered with other loads.
    if (IsLoadChain && isa<LoadInst>(I))
      continue;

    // Stores can be sunk below invariant loads.
    if (!IsLoadChain && isInvariantLoad(I))
      continue;

    // If I is in the chain, we can tell whether it aliases ChainIt by checking
    // what offset ChainIt accesses.  This may be better than AA is able to do.
    //
    // We should really only have duplicate offsets for stores (the duplicate
    // loads should be CSE'ed), but in case we have a duplicate load, we'll
    // split the chain so we don't have to handle this case specially.
    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
      // I and ChainElem overlap if:
      //   - I and ChainElem have the same offset, OR
      //   - I's offset is less than ChainElem's, but I touches past the
      //     beginning of ChainElem, OR
      //   - ChainElem's offset is less than I's, but ChainElem touches past the
      //     beginning of I.
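      //
      // Illustrative example (not from the original source): with 4-byte
      // accesses, offsets 0 and 4 do not overlap, but offsets 0 and 2 do,
      // because the access at offset 0 covers bytes [0, 4) and the access at
      // offset 2 starts inside that range.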
      const APInt &IOffset = OffsetIt->second;
      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
      if (IOffset == ChainElemOffset ||
          (IOffset.sle(ChainElemOffset) &&
           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
          (ChainElemOffset.sle(IOffset) &&
           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
        LLVM_DEBUG({
          // Double check that AA also sees this alias.  If not, we probably
          // have a bug.
          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
        });
        return false; // We found an aliasing instruction; bail.
      }
      continue; // We're confident there's no alias.
    }

    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                        << "  Aliasing instruction:\n"
                        << "    " << *I << '\n'
                        << "  Aliased instruction and pointer:\n"
                        << "    " << *ChainElem << '\n'
                        << "    " << *getLoadStorePointerOperand(ChainElem)
                        << '\n');
      return false;
    }
  }
  return true;
}
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
                    << ", MatchingOpIdxB=" << MatchingOpIdxB
                    << ", Signed=" << Signed << "\n");
  // If both OpA and OpB are adds with NSW/NUW and with one of the operands
  // being the same, we can guarantee that the transformation is safe if we can
  // prove that OpA won't overflow when IdxDiff is added to the other operand
  // of OpA.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  //  Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
  //  It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
  //  1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    //       `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}
std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
                    << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = GTIA.getSequentialElementStride(DL);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return std::nullopt;

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  if (IdxDiffSCEV == SE.getCouldNotCompute())
    return std::nullopt;

  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
  if (!IdxDiffRange.isSingleElement())
    return std::nullopt;
  APInt IdxDiff = *IdxDiffRange.getSingleElement();

  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
                    << "\n");

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is an operand which
    // is the same in those two instructions.  Below we account for possible
    // orders of the operands of these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  //
  // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
  // order bit other than the sign bit are known to be zero in ValA, we can add
  // Diff to it while guaranteeing no overflow of any sort.
  //
  // If IdxDiff is negative, do the same, but swap ValA and ValB.
  if (!Safe) {
    // When computing known bits, use the GEPs as context instructions, since
    // they likely are in the same BB as the load/store.
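    //
    // Illustrative example (not from the original source): if IdxDiff is 3
    // (0b11) and the low two bits of ValA are known to be zero, adding 3 only
    // fills those known-zero bits and cannot generate a carry into higher
    // bits, so the add cannot overflow.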
    KnownBits Known(BitWidth);
    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
                     ContextInst, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
      return std::nullopt;
    Safe = true;
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}
std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", ContextInst="
                        << *ContextInst << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
      if (!TrueDiff)
        return std::nullopt;
      std::optional<APInt> FalseDiff =
          getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
                            ContextInst, Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}
EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for true and
      // false values of the condition.  Therefore using the selects themselves
      // for grouping instructions would put consecutive accesses into different
      // lists and they won't even be checked for being consecutive, and won't
      // be vectorized.
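      //
      // Illustrative example (not from the original source; %a, %b, %a1, %b1
      // are hypothetical values):
      //   %p0 = select i1 %c, ptr %a, ptr %b
      //   %p1 = select i1 %c, ptr %a1, ptr %b1
      // Keying both accesses on %c (rather than on the two distinct selects)
      // keeps them in one equivalence class, so getConstantOffsetSelects can
      // later try to prove a constant distance between them.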
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes.  They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers.  The vectorizeLoadChain/vectorizeStoreChain
    // functions are currently using an integer type for the vectorized
    // load/store, and do not support casting between the integer type and a
    // vector of pointers (e.g. i64 to <2 x i16*>)
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;

    Ret[{GetUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .emplace_back(&I);
  }

  return Ret;
}
std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

  // Compare each instruction in `instrs` to leader of the N most recently-used
  // chains.  This limits the O(n^2) behavior of this pass while also allowing
  // us to build arbitrarily long chains.
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      if (std::optional<APInt> Offset = getConstantOffset(
              getLoadStorePointerOperand(ChainIter->first),
              getLoadStorePointerOperand(I),
              /*ContextInst=*/
              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
        // `Offset` might not have the expected number of bits, if e.g. AS has a
        // different number of bits than opaque pointers.
        ChainIter->second.emplace_back(I, Offset.value());
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.emplace_back(I, ZeroOffset);
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.emplace_back(std::move(E.second));
  return Ret;
}
std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
  if (std::optional<APInt> Diff =
          getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}