//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
//  - Break up each basic block into pseudo-BBs, composed of instructions which
//    are guaranteed to transfer control to their successors.
//  - Within a single pseudo-BB, find all loads, and group them into
//    "equivalence classes" according to getUnderlyingObject() and loaded
//    element size.  Do the same for stores.
//  - For each equivalence class, greedily build "chains".  Each chain has a
//    leader instruction, and every other member of the chain has a known
//    constant offset from the first instr in the chain.
//  - Break up chains so that they contain only contiguous accesses of legal
//    size with no intervening may-alias instrs.
//  - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before.  To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.
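//
// Illustrative example (not from the original source; a hypothetical sketch of
// the transformation).  Given four adjacent i32 loads in one BB:
//
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   %p2 = getelementptr inbounds i32, ptr %p, i64 2
//   %p3 = getelementptr inbounds i32, ptr %p, i64 3
//   %v0 = load i32, ptr %p, align 16
//   %v1 = load i32, ptr %p1, align 4
//   %v2 = load i32, ptr %p2, align 8
//   %v3 = load i32, ptr %p3, align 4
//
// the pass would (target permitting) emit one vector load plus element
// extracts, which are free on the GPUs described above:
//
//   %vec = load <4 x i32>, ptr %p, align 16
//   %v0  = extractelement <4 x i32> %vec, i32 0
//   %v1  = extractelement <4 x i32> %vec, i32 1
//   %v2  = extractelement <4 x i32> %vec, i32 2
//   %v3  = extractelement <4 x i32> %vec, i32 3
//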
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <numeric>
#include <optional>
#include <tuple>
#include <type_traits>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {
// Equivalence class key, the initial tuple by which we group loads/stores.
// Loads/stores with different EqClassKeys are never merged.
//
// (We could in theory remove element-size from this tuple.  We'd just need
// to fix up the vector packing/unpacking code.)
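//
// Illustrative example (not from the original source): two i32 loads based on
// the same underlying object in addrspace(1) share the key
//   {getUnderlyingObject(Ptr), /*AddrSpace=*/1, /*ElementSize=*/32, /*IsLoad=*/1}
// and may later be chained together, while an i8 load from the same object
// lands in a different class because its element size differs.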
using EqClassKey =
    std::tuple<const Value * /* result of getUnderlyingObject() */,
               unsigned /* AddrSpace */,
               unsigned /* Load/Store element size bits */,
               char /* IsLoad; char b/c bool can't be a DenseMap key */
               >;

[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                                               const EqClassKey &K) {
  const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
  return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
            << " of element size " << ElementSize << " bits in addrspace "
            << AddrSpace;
}
// A Chain is a set of instructions such that:
//  - All instructions have the same equivalence class, so in particular all are
//    loads, or all are stores.
//  - We know the address accessed by the i'th chain elem relative to the
//    chain's leader instruction, which is the first instr of the chain in BB
//    order.
//
// Chains have two canonical orderings:
//  - BB order, sorted by Instr->comesBefore.
//  - Offset order, sorted by OffsetFromLeader.
// This pass switches back and forth between these orders.
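//
// Illustrative example (not from the original source): if the leader is a load
// at address P, a chain might look like
//   {load P, +0}, {load P-4, -4}, {load P+4, +4}    // BB order
// which in offset order becomes
//   {load P-4, -4}, {load P, +0}, {load P+4, +4}.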
struct ChainElem {
  Instruction *Inst;
  APInt OffsetFromLeader;
  ChainElem(Instruction *Inst, APInt OffsetFromLeader)
      : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
};

using Chain = SmallVector<ChainElem, 1>;

void sortChainInBBOrder(Chain &C) {
  sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
}
void sortChainInOffsetOrder(Chain &C) {
  sort(C, [](const auto &A, const auto &B) {
    if (A.OffsetFromLeader != B.OffsetFromLeader)
      return A.OffsetFromLeader.slt(B.OffsetFromLeader);
    return A.Inst->comesBefore(B.Inst); // stable tiebreaker
  });
}
[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
  for (const auto &E : C) {
    dbgs() << "  " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
  }
}
using EquivalenceClassMap =
    MapVector<EqClassKey, SmallVector<Instruction *, 8>>;

// FIXME: Assuming stack alignment of 4 is always good enough
constexpr unsigned StackAdjustedAlignment = 4;

Instruction *propagateMetadata(Instruction *I, const Chain &C) {
  SmallVector<Value *, 8> Values;
  for (const ChainElem &E : C)
    Values.emplace_back(E.Inst);
  return propagateMetadata(I, Values);
}
bool isInvariantLoad(const Instruction *I) {
  const LoadInst *LI = dyn_cast<LoadInst>(I);
  return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
}
/// Reorders the instructions that I depends on (the instructions defining its
/// operands), to ensure they dominate I.
void reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.emplace_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int Idx = 0; Idx < NumOperands; Idx++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      assert(IM != I && "Unexpected cycle while re-ordering instructions");

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.emplace_back(IM);
      }
    }
  }

  // All instructions to move should follow I.  Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
    Instruction *IM = &*(BBI++);
    if (!InstructionsToMove.contains(IM))
      continue;
    IM->moveBefore(I);
  }
}
class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

  // We could erase instrs right after vectorizing them, but that can mess up
  // our BB iterators, and also can make the equivalence class keys point to
  // freed memory.  This is fixable, but it's simpler just to wait until we're
  // done with the BB and erase all at once.
  SmallVector<Instruction *, 128> ToErase;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  static const unsigned MaxDepth = 3;

  /// Runs the vectorizer on a "pseudo basic block", which is a range of
  /// instructions [Begin, End) within one BB all of which have
  /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
  bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);

  /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
  /// in the same BB with the same value for getUnderlyingObject() etc.
  bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
                             ArrayRef<Instruction *> EqClass);

  /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
  /// where all instructions access a known, constant offset from the first
  /// instruction in the chain.
  bool runOnChain(Chain &C);

  /// Splits the chain into subchains of instructions which read/write a
  /// contiguous block of memory.  Discards any length-1 subchains (because
  /// there's nothing to vectorize in there).
  std::vector<Chain> splitChainByContiguity(Chain &C);

  /// Splits the chain into subchains where it's safe to hoist loads up to the
  /// beginning of the sub-chain and it's safe to sink loads up to the end of
  /// the sub-chain.  Discards any length-1 subchains.
  std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);

  /// Splits the chain into subchains that make legal, aligned accesses.
  /// Discards any length-1 subchains.
  std::vector<Chain> splitChainByAlignment(Chain &C);

  /// Converts the instrs in the chain into a single vectorized load or store.
  /// Adds the old scalar loads/stores to ToErase.
  bool vectorizeChain(Chain &C);

  /// Tries to compute the offset in bytes PtrB - PtrA.
  std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
                                         Instruction *ContextInst,
                                         unsigned Depth = 0);
  std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
                                                     Instruction *ContextInst,
                                                     unsigned Depth);
  std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
                                                Instruction *ContextInst,
                                                unsigned Depth);

  /// Gets the element type of the vector that the chain will load or store.
  /// This is nontrivial because the chain may contain elements of different
  /// types; e.g. it's legal to have a chain that contains both i32 and float.
  Type *getChainElemTy(const Chain &C);

  /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
  /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
  /// instructions.
  ///
  /// The map ChainElemOffsets must contain all of the elements in
  /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
  /// address.  It's ok if it contains additional entries.
  template <bool IsLoadChain>
  bool isSafeToMove(
      Instruction *ChainElem, Instruction *ChainBegin,
      const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

  /// Collects loads and stores grouped by "equivalence class", where:
  ///   - all elements in an eq class are a load or all are a store,
  ///   - they all load/store the same element size (it's OK to have e.g. i8 and
  ///     <4 x i8> in the same class, but not i32 and <4 x i8>), and
  ///   - they all have the same value for getUnderlyingObject().
  EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
                                                BasicBlock::iterator End);

  /// Partitions Instrs into "chains" where every instruction has a known
  /// constant offset from the first instr in the chain.
  ///
  /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
  /// in the chain is the leader, and an instr touches distance 0 from itself.
  std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace
char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}
bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  return Vectorizer(F, AA, AC, DT, SE, TTI).run();
}
PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}
bool Vectorizer::run() {
  bool Changed = false;
  // Break up the BB if there are any instrs which aren't guaranteed to transfer
  // execution to their successor.
  //
  // Consider, for example:
  //
  //   def assert_arr_len(int n) { if (n < 2) exit(); }
  //
  //   load arr[0]
  //   call assert_arr_len(arr.length)
  //   load arr[1]
  //
  // Even though assert_arr_len does not read or write any memory, we can't
  // speculate the second load before the call.  More info at
  // https://github.com/llvm/llvm-project/issues/52950.
  for (BasicBlock *BB : post_order(&F)) {
    // BB must at least have a terminator.
    assert(!BB->empty());

    SmallVector<BasicBlock::iterator, 8> Barriers;
    Barriers.emplace_back(BB->begin());
    for (Instruction &I : *BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
        Barriers.emplace_back(I.getIterator());
    Barriers.emplace_back(BB->end());

    for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
         ++It)
      Changed |= runOnPseudoBB(*It, *std::next(It));

    for (Instruction *I : ToErase) {
      auto *PtrOperand = getLoadStorePointerOperand(I);
      if (I->use_empty())
        I->eraseFromParent();
      RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
    }
    ToErase.clear();
  }

  return Changed;
}
bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
                               BasicBlock::iterator End) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
    if (End != Begin->getParent()->end())
      dbgs() << *End;
    else
      dbgs() << "<BB end>";
    dbgs() << "]\n";
  });

  bool Changed = false;
  for (const auto &[EqClassKey, EqClass] :
       collectEquivalenceClasses(Begin, End))
    Changed |= runOnEquivalenceClass(EqClassKey, EqClass);

  return Changed;
}
bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
                                       ArrayRef<Instruction *> EqClass) {
  bool Changed = false;

  LLVM_DEBUG({
    dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
           << " keyed on " << EqClassKey << ":\n";
    for (Instruction *I : EqClass)
      dbgs() << "  " << *I << "\n";
  });

  std::vector<Chain> Chains = gatherChains(EqClass);
  LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
                    << " nontrivial chains.\n";);
  for (Chain &C : Chains)
    Changed |= runOnChain(C);
  return Changed;
}
bool Vectorizer::runOnChain(Chain &C) {
  LLVM_DEBUG({
    dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  // Split up the chain into increasingly smaller chains, until we can finally
  // vectorize the chains.
  //
  // (Don't be scared by the depth of the loop nest here.  These operations are
  // all at worst O(n lg n) in the number of instructions, and splitting chains
  // doesn't change the number of instrs.  So the whole loop nest is O(n lg n).)
  bool Changed = false;
  for (auto &C : splitChainByMayAliasInstrs(C))
    for (auto &C : splitChainByContiguity(C))
      for (auto &C : splitChainByAlignment(C))
        Changed |= vectorizeChain(C);
  return Changed;
}
std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
  sortChainInBBOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
    dumpChain(C);
  });

  // We know that elements in the chain with nonoverlapping offsets can't
  // alias, but AA may not be smart enough to figure this out.  Use a map from
  // instruction to offset-from-leader so we can answer these queries
  // ourselves.
  DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
  for (const auto &E : C)
    ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});

  // Loads get hoisted up to the first load in the chain.  Stores get sunk
  // down to the last store in the chain.  Our algorithm for loads is:
  //
  //  - Take the first element of the chain.  This is the start of a new chain.
  //  - Take the next element of `Chain` and check for may-alias instructions
  //    up to the start of NewChain.  If no may-alias instrs, add it to
  //    NewChain.  Otherwise, start a new NewChain.
  //
  // For stores it's the same except in the reverse direction.
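  //
  // Illustrative example (not from the original source): for a load chain
  // {A, B} with an intervening store S that may alias them, hoisting B up to
  // A would move it across S, so B starts a new chain; both resulting chains
  // have length 1 and are discarded.  If S provably cannot alias (e.g. it
  // writes a disjoint offset of the same underlying object), the chain {A, B}
  // survives intact.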
  // We expect IsLoad to be an std::bool_constant.
  auto Impl = [&](auto IsLoad) {
    // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
    auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
      if constexpr (IsLoad())
        return std::make_pair(C.begin(), C.end());
      else
        return std::make_pair(C.rbegin(), C.rend());
    }(IsLoad);
    assert(ChainBegin != ChainEnd);

    std::vector<Chain> Chains;
    SmallVector<ChainElem, 1> NewChain;
    NewChain.emplace_back(*ChainBegin);
    for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
      if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
                               ChainOffsets)) {
        LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
                          << *ChainIt->Inst << " into " << *ChainBegin->Inst
                          << "\n");
        NewChain.emplace_back(*ChainIt);
      } else {
        LLVM_DEBUG(
            dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
                   << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
        if (NewChain.size() > 1) {
          LLVM_DEBUG({
            dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
            dumpChain(NewChain);
          });
          Chains.emplace_back(std::move(NewChain));
        }

        // Start a new chain.
        NewChain = SmallVector<ChainElem, 1>({*ChainIt});
      }
    }
    if (NewChain.size() > 1) {
      LLVM_DEBUG({
        dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
        dumpChain(NewChain);
      });
      Chains.emplace_back(std::move(NewChain));
    }
    return Chains;
  };

  if (isa<LoadInst>(C[0].Inst))
    return Impl(/*IsLoad=*/std::bool_constant<true>());

  assert(isa<StoreInst>(C[0].Inst));
  return Impl(/*IsLoad=*/std::bool_constant<false>());
}
std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByContiguity considering chain:\n";
    dumpChain(C);
  });

  std::vector<Chain> Ret;
  Ret.push_back({C.front()});

  for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
    // `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
    auto &CurChain = Ret.back();
    const ChainElem &Prev = CurChain.back();
    unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
    assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
                              "collectEquivalenceClasses");
    APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;

    // Add this instruction to the end of the current chain, or start a new one.
    bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
    LLVM_DEBUG(dbgs() << "LSV: Instructions are "
                      << (AreContiguous ? "" : "not ") << "contiguous: "
                      << *Prev.Inst << " (ends at offset " << PrevReadEnd
                      << ") -> " << *It->Inst << " (starts at offset "
                      << It->OffsetFromLeader << ")\n");
    if (AreContiguous)
      CurChain.push_back(*It);
    else
      Ret.push_back({*It});
  }

  // Filter out length-1 chains, these are uninteresting.
  llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
  return Ret;
}
Type *Vectorizer::getChainElemTy(const Chain &C) {
  assert(!C.empty());
  // The rules are:
  //  - If there are any pointer types in the chain, use an integer type.
  //  - Prefer an integer type if it appears in the chain.
  //  - Otherwise, use the first type in the chain.
  //
  // The rule about pointer types is a simplification when we merge e.g. a load
  // of a ptr and a double.  There's no direct conversion from a ptr to a
  // double; it requires a ptrtoint followed by a bitcast.
  //
  // It's unclear to me if the other rules have any practical effect, but we do
  // it to match this pass's previous behavior.
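  //
  // Illustrative example (not from the original source): a chain of
  // {load double, load ptr} on a target with 64-bit pointers gets element type
  // i64, is emitted as a <2 x i64> load, and each scalar use is recreated via
  // a bitcast or inttoptr back to its original type.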
  if (any_of(C, [](const ChainElem &E) {
        return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
      }))
    return Type::getIntNTy(
        F.getContext(),
        DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));

  for (const ChainElem &E : C)
    if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
      return T;
  return getLoadStoreType(C[0].Inst)->getScalarType();
}
std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
  // We use a simple greedy algorithm.
  //  - Given a chain of length N, find all prefixes that
  //    (a) are not longer than the max register length, and
  //    (b) are a power of 2.
  //  - Starting from the longest prefix, try to create a vector of that length.
  //  - If one of them works, great.  Repeat the algorithm on any remaining
  //    elements in the chain.
  //  - If none of them work, discard the first element and repeat on a chain
  //    of length N - 1.
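  //
  // Illustrative example (not from the original source): with a 16-byte vector
  // register and a chain of eight contiguous i32 loads, the longest candidate
  // is the first four elements (16 bytes).  If <4 x i32> is legal and fast we
  // emit it and rerun on the remaining four elements; otherwise we try the
  // two-element prefix, and failing that drop the first element and retry.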
  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: splitChainByAlignment considering chain:\n";
    dumpChain(C);
  });

  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
                             unsigned ChainSizeBytes, VectorType *VecTy) {
    return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
                                                 ChainSizeBytes, VecTy)
                       : TTI.getStoreVectorFactor(VF, LoadStoreSize,
                                                  ChainSizeBytes, VecTy);
  };

  for (const auto &E : C) {
    Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
    assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
           "Should have filtered out non-power-of-two elements in "
           "collectEquivalenceClasses.");
  }

  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;

  std::vector<Chain> Ret;
  for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
    // Find candidate chains of size not greater than the largest vector reg.
    // These chains are over the closed interval [CBegin, CEnd].
    SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
        CandidateChains;
    for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
      APInt Sz = C[CEnd].OffsetFromLeader +
                 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
                 C[CBegin].OffsetFromLeader;
      if (Sz.sgt(VecRegBytes))
        break;
      CandidateChains.emplace_back(CEnd,
                                   static_cast<unsigned>(Sz.getLimitedValue()));
    }

    // Consider the longest chain first.
    for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
         It != End; ++It) {
      auto [CEnd, SizeBytes] = *It;
      LLVM_DEBUG(
          dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
                 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");

      Type *VecElemTy = getChainElemTy(C);
      // Note, VecElemTy is a power of 2, but might be less than one byte.  For
      // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
      // VecElemTy would be i4.
      unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);

      // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
      assert((8 * SizeBytes) % VecElemBits == 0);
      unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
      FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
      unsigned VF = 8 * VecRegBytes / VecElemBits;
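      // Illustrative numbers (not from the original source): with a 16-byte
      // vector register (VecRegBytes = 16), a candidate of SizeBytes = 16 and
      // VecElemBits = 32, we get NumVecElems = 4, VecTy = <4 x i32>, and
      // VF = 4.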
      // Check that TTI is happy with this vectorization factor.
      unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
                                          VecElemBits * NumVecElems / 8, VecTy);
      if (TargetVF != VF && TargetVF < NumVecElems) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because TargetVF="
                   << TargetVF << " != VF=" << VF
                   << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
        continue;
      }

      // Is a load/store with this alignment allowed by TTI and at least as fast
      // as an unvectorized load/store?
      //
      // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
      auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
                               &F = F](Align Alignment) {
        if (Alignment.value() % SizeBytes == 0)
          return true;
        unsigned VectorizedSpeed = 0;
        bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
            F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
        if (!AllowsMisaligned) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " is misaligned, and therefore can't be vectorized.\n");
          return false;
        }

        unsigned ElementwiseSpeed = 0;
        (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
                                             Alignment, &ElementwiseSpeed);
        if (VectorizedSpeed < ElementwiseSpeed) {
          LLVM_DEBUG(dbgs()
                     << "LSV: Access of " << SizeBytes << "B in addrspace "
                     << AS << " with alignment " << Alignment.value()
                     << " has relative speed " << VectorizedSpeed
                     << ", which is lower than the elementwise speed of "
                     << ElementwiseSpeed
                     << ".  Therefore this access won't be vectorized.\n");
          return false;
        }
        return true;
      };

      // If we're loading/storing from an alloca, align it if possible.
      //
      // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
      // tells us this is beneficial.  This feels a bit odd, but it matches
      // existing tests.  This isn't *so* bad, because at most we align to 4
      // bytes (current value of StackAdjustedAlignment).
      //
      // FIXME: We will upgrade the alignment of the alloca even if it turns out
      // we can't vectorize for some other reason.
      Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
      bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
                            isa<AllocaInst>(PtrOperand->stripPointerCasts());
      Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
      Align PrefAlign = Align(StackAdjustedAlignment);
      if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
          IsAllowedAndFast(PrefAlign)) {
        Align NewAlign = getOrEnforceKnownAlignment(
            PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
        if (NewAlign >= Alignment) {
          LLVM_DEBUG(dbgs()
                     << "LSV: splitByChain upgrading alloca alignment from "
                     << Alignment.value() << " to " << NewAlign.value()
                     << "\n");
          Alignment = NewAlign;
        }
      }

      if (!IsAllowedAndFast(Alignment)) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because its alignment is not AllowedAndFast: "
                   << Alignment.value() << "\n");
        continue;
      }

      if ((IsLoadChain &&
           !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
          (!IsLoadChain &&
           !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
        LLVM_DEBUG(
            dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
                      "because !isLegalToVectorizeLoad/StoreChain.");
        continue;
      }

      // Hooray, we can vectorize this chain!
      Chain &NewChain = Ret.emplace_back();
      for (unsigned I = CBegin; I <= CEnd; ++I)
        NewChain.emplace_back(C[I]);
      CBegin = CEnd; // Skip over the instructions we've added to the chain.
      break;
    }
  }
  return Ret;
}
bool Vectorizer::vectorizeChain(Chain &C) {
  sortChainInOffsetOrder(C);

  LLVM_DEBUG({
    dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
    dumpChain(C);
  });

  Type *VecElemTy = getChainElemTy(C);
  bool IsLoadChain = isa<LoadInst>(C[0].Inst);
  unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
  unsigned ChainBytes = std::accumulate(
      C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
        return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
      });
  assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
  // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
  // than 1 byte (e.g. VecTy == <32 x i1>).
  Type *VecTy = FixedVectorType::get(
      VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));

  Align Alignment = getLoadStoreAlignment(C[0].Inst);
  // If this is a load/store of an alloca, we might have upgraded the alloca's
  // alignment earlier.  Get the new alignment.
  if (AS == DL.getAllocaAddrSpace()) {
    Alignment = std::max(
        Alignment,
        getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
                                   MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
  }

  // All elements of the chain must have the same scalar-type size.
  for (const ChainElem &E : C)
    assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
           DL.getTypeStoreSize(VecElemTy));

  Instruction *VecInst;
  if (IsLoadChain) {
    // Loads get hoisted to the location of the first load in the chain.  We may
    // also need to hoist the (transitive) operands of the loads.
    Builder.SetInsertPoint(
        llvm::min_element(C, [](const auto &A, const auto &B) {
          return A.Inst->comesBefore(B.Inst);
        })->Inst);

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedLoad(VecTy,
                                        getLoadStorePointerOperand(C[0].Inst),
                                        Alignment);

    unsigned VecIdx = 0;
    for (const ChainElem &E : C) {
      Instruction *I = E.Inst;
      Value *V;
      Type *T = getLoadStoreType(I);
      if (auto *VT = dyn_cast<FixedVectorType>(T)) {
        auto Mask = llvm::to_vector<8>(
            llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
        V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
        VecIdx += VT->getNumElements();
      } else {
        V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
                                         I->getName());
        ++VecIdx;
      }
      if (V->getType() != I->getType())
        V = Builder.CreateBitOrPointerCast(V, I->getType());
      I->replaceAllUsesWith(V);
    }

    // Finally, we need to reorder the instrs in the BB so that the (transitive)
    // operands of VecInst appear before it.  To see why, suppose we have
    // vectorized the following code:
    //
    //   ptr1  = gep a, 1
    //   load1 = load i32 ptr1
    //   ptr0  = gep a, 0
    //   load0 = load i32 ptr0
    //
    // We will put the vectorized load at the location of the earliest load in
    // the BB, i.e. load1.  We get:
    //
    //   ptr1  = gep a, 1
    //   loadv = load <2 x i32> ptr0
    //   load0 = extractelement loadv, 0
    //   load1 = extractelement loadv, 1
    //   ptr0  = gep a, 0
    //
    // Notice that loadv uses ptr0, which is defined *after* it!
    reorder(VecInst);
  } else {
    // Stores get sunk to the location of the last store in the chain.
    Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
                             return A.Inst->comesBefore(B.Inst);
                           })->Inst);

    // Build the vector to store.
    Value *Vec = PoisonValue::get(VecTy);
    unsigned VecIdx = 0;
    auto InsertElem = [&](Value *V) {
      if (V->getType() != VecElemTy)
        V = Builder.CreateBitOrPointerCast(V, VecElemTy);
      Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
    };
    for (const ChainElem &E : C) {
      auto *I = cast<StoreInst>(E.Inst);
      if (FixedVectorType *VT =
              dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
        for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
          InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
                                                  Builder.getInt32(J)));
        }
      } else {
        InsertElem(I->getValueOperand());
      }
    }

    // Chain is in offset order, so C[0] is the instr with the lowest offset,
    // i.e. the root of the vector.
    VecInst = Builder.CreateAlignedStore(
        Vec, getLoadStorePointerOperand(C[0].Inst), Alignment);
  }

  propagateMetadata(VecInst, C);

  for (const ChainElem &E : C)
    ToErase.emplace_back(E.Inst);

  ++NumVectorInstructions;
  NumScalarsVectorized += C.size();
  return true;
}
template <bool IsLoadChain>
bool Vectorizer::isSafeToMove(
    Instruction *ChainElem, Instruction *ChainBegin,
    const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
  LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
                    << *ChainBegin << ")\n");

  assert(isa<LoadInst>(ChainElem) == IsLoadChain);
  if (ChainElem == ChainBegin)
    return true;

  // Invariant loads can always be reordered; by definition they are not
  // clobbered by stores.
  if (isInvariantLoad(ChainElem))
    return true;

  auto BBIt = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainElem);
    else
      return BasicBlock::iterator(ChainElem);
  }());
  auto BBItEnd = std::next([&] {
    if constexpr (IsLoadChain)
      return BasicBlock::reverse_iterator(ChainBegin);
    else
      return BasicBlock::iterator(ChainBegin);
  }());

  const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
  const unsigned ChainElemSize =
      DL.getTypeStoreSize(getLoadStoreType(ChainElem));

  for (; BBIt != BBItEnd; ++BBIt) {
    Instruction *I = &*BBIt;

    if (!I->mayReadOrWriteMemory())
      continue;

    // Loads can be reordered with other loads.
    if (IsLoadChain && isa<LoadInst>(I))
      continue;

    // Stores can be sunk below invariant loads.
    if (!IsLoadChain && isInvariantLoad(I))
      continue;

    // If I is in the chain, we can tell whether it aliases ChainIt by checking
    // what offset ChainIt accesses.  This may be better than AA is able to do.
    //
    // We should really only have duplicate offsets for stores (the duplicate
    // loads should be CSE'ed), but in case we have a duplicate load, we'll
    // split the chain so we don't have to handle this case specially.
    if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
      // I and ChainElem overlap if:
      //   - I and ChainElem have the same offset, OR
      //   - I's offset is less than ChainElem's, but I touches past the
      //     beginning of ChainElem, OR
      //   - ChainElem's offset is less than I's, but ChainElem touches past the
      //     beginning of I.
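      //
      // Illustrative example (not from the original source): with 4-byte
      // accesses, offsets 0 and 4 do not overlap, but offsets 0 and 2 do,
      // because the access at offset 0 covers bytes [0, 4) and the access at
      // offset 2 starts inside that range.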
      const APInt &IOffset = OffsetIt->second;
      unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
      if (IOffset == ChainElemOffset ||
          (IOffset.sle(ChainElemOffset) &&
           (IOffset + IElemSize).sgt(ChainElemOffset)) ||
          (ChainElemOffset.sle(IOffset) &&
           (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
        LLVM_DEBUG({
          // Double check that AA also sees this alias.  If not, we probably
          // have a bug.
          ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
          assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
          dbgs() << "LSV: Found alias in chain: " << *I << "\n";
        });
        return false; // We found an aliasing instruction; bail.
      }
      continue; // We're confident there's no alias.
    }

    LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
    ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
    if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
      LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
                        << "  Aliasing instruction:\n"
                        << "    " << *I << '\n'
                        << "  Aliased instruction and pointer:\n"
                        << "    " << *ChainElem << '\n'
                        << "    " << *getLoadStorePointerOperand(ChainElem)
                        << '\n');
      return false;
    }
  }
  return true;
}
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
                    << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
                    << MatchingOpIdxA << ", AddOpB=" << *AddOpB
                    << ", MatchingOpIdxB=" << MatchingOpIdxB
                    << ", Signed=" << Signed << "\n");
  // If both OpA and OpB are adds with NSW/NUW and with one of the operands
  // being the same, we can guarantee that the transformation is safe if we can
  // prove that OpA won't overflow when IdxDiff is added to the other operand
  // of OpA.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  //  Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
  //  It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
  //  1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    //       `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}
std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
                    << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
                    << " Depth=" << Depth << "\n");
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return std::nullopt;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return std::nullopt;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return std::nullopt;

  uint64_t Stride = GTIA.getSequentialElementStride(DL);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return std::nullopt;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return std::nullopt;

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  if (IdxDiffSCEV == SE.getCouldNotCompute())
    return std::nullopt;

  ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
  if (!IdxDiffRange.isSingleElement())
    return std::nullopt;
  APInt IdxDiff = *IdxDiffRange.getSingleElement();

  LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
                    << "\n");

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is an operand which
    // is the same in those two instructions.  Below we account for possible
    // orders of the operands of these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  //
  // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
  // order bit other than the sign bit are known to be zero in ValA, we can add
  // Diff to it while guaranteeing no overflow of any sort.
  //
  // If IdxDiff is negative, do the same, but swap ValA and ValB.
  if (!Safe) {
    // When computing known bits, use the GEPs as context instructions, since
    // they likely are in the same BB as the load/store.
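    //
    // Illustrative example (not from the original source): if IdxDiff is 3
    // (0b11) and the low two bits of ValA are known to be zero, adding 3 only
    // fills those known-zero bits and cannot generate a carry into higher
    // bits, so the add cannot overflow.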
    KnownBits Known(BitWidth);
    computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
                     ContextInst, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
      return std::nullopt;
    Safe = true;
  }

  if (Safe)
    return IdxDiff * Stride;
  return std::nullopt;
}
std::optional<APInt> Vectorizer::getConstantOffsetSelects(
    Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
  if (Depth++ == MaxDepth)
    return std::nullopt;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      if (SelectA->getCondition() != SelectB->getCondition())
        return std::nullopt;
      LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
                        << ", PtrB=" << *PtrB << ", ContextInst="
                        << *ContextInst << ", Depth=" << Depth << "\n");
      std::optional<APInt> TrueDiff = getConstantOffset(
          SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
      if (!TrueDiff)
        return std::nullopt;
      std::optional<APInt> FalseDiff =
          getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
                            ContextInst, Depth);
      if (TrueDiff == FalseDiff)
        return TrueDiff;
    }
  }
  return std::nullopt;
}
EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
                                      BasicBlock::iterator End) {
  EquivalenceClassMap Ret;

  auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
    const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
    if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
      // The selects themselves are distinct instructions even if they share
      // the same condition and evaluate to consecutive pointers for true and
      // false values of the condition.  Therefore using the selects themselves
      // for grouping instructions would put consecutive accesses into different
      // lists and they won't even be checked for being consecutive, and won't
      // be vectorized.
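      //
      // Illustrative example (not from the original source; %a, %b, %a1, %b1
      // are hypothetical values):
      //   %p0 = select i1 %c, ptr %a, ptr %b
      //   %p1 = select i1 %c, ptr %a1, ptr %b1
      // Keying both accesses on %c (rather than on the two distinct selects)
      // keeps them in one equivalence class, so getConstantOffsetSelects can
      // later try to prove a constant distance between them.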
      return Sel->getCondition();
    }
    return ObjPtr;
  };

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes.  They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers.  The vectorizeLoadChain/vectorizeStoreChain
    // functions are currently using an integer type for the vectorized
    // load/store, and do not support casting between the integer type and a
    // vector of pointers (e.g. i64 to <2 x i16*>)
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;

    Ret[{GetUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .emplace_back(&I);
  }

  return Ret;
}
std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;

  // Compare each instruction in `instrs` to leader of the N most recently-used
  // chains.  This limits the O(n^2) behavior of this pass while also allowing
  // us to build arbitrarily long chains.
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      if (std::optional<APInt> Offset = getConstantOffset(
              getLoadStorePointerOperand(ChainIter->first),
              getLoadStorePointerOperand(I),
              /*ContextInst=*/
              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
        // `Offset` might not have the expected number of bits, if e.g. AS has a
        // different number of bits than opaque pointers.
        ChainIter->second.emplace_back(I, Offset.value());
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.emplace_back(I, ZeroOffset);
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.emplace_back(std::move(E.second));
  return Ret;
}
std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
  if (std::optional<APInt> Diff =
          getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}