1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass merges loads/stores to/from sequential memory addresses into vector
10 // loads/stores. Although there's nothing GPU-specific in here, this pass is
11 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
13 // (For simplicity below we talk about loads only, but everything also applies
16 // This pass is intended to be run late in the pipeline, after other
17 // vectorization opportunities have been exploited. So the assumption here is
18 // that immediately following our new vector load we'll need to extract out the
19 // individual elements of the load, so we can operate on them individually.
21 // On CPUs this transformation is usually not beneficial, because extracting the
22 // elements of a vector register is expensive on most architectures. It's
23 // usually better just to load each element individually into its own scalar
26 // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27 // "vector load" loads directly into a series of scalar registers. In effect,
28 // extracting the elements of the vector is free. It's therefore always
29 // beneficial to vectorize a sequence of loads on these architectures.
31 // Vectorizing (perhaps a better name might be "coalescing") loads can have
32 // large performance impacts on GPU kernels, and opportunities for vectorizing
33 // are common in GPU code. This pass tries very hard to find such
34 // opportunities; its runtime is quadratic in the number of loads in a BB.
36 // Some CPU architectures, such as ARM, have instructions that load into
37 // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38 // could use this pass (with some modifications), but currently it implements
39 // its own pass to do something similar to what we do here.
41 #include "llvm/ADT/APInt.h"
42 #include "llvm/ADT/ArrayRef.h"
43 #include "llvm/ADT/MapVector.h"
44 #include "llvm/ADT/PostOrderIterator.h"
45 #include "llvm/ADT/STLExtras.h"
46 #include "llvm/ADT/SmallPtrSet.h"
47 #include "llvm/ADT/SmallVector.h"
48 #include "llvm/ADT/Statistic.h"
49 #include "llvm/ADT/iterator_range.h"
50 #include "llvm/Analysis/AliasAnalysis.h"
51 #include "llvm/Analysis/MemoryLocation.h"
52 #include "llvm/Analysis/OrderedBasicBlock.h"
53 #include "llvm/Analysis/ScalarEvolution.h"
54 #include "llvm/Analysis/TargetTransformInfo.h"
55 #include "llvm/Transforms/Utils/Local.h"
56 #include "llvm/Analysis/ValueTracking.h"
57 #include "llvm/Analysis/VectorUtils.h"
58 #include "llvm/IR/Attributes.h"
59 #include "llvm/IR/BasicBlock.h"
60 #include "llvm/IR/Constants.h"
61 #include "llvm/IR/DataLayout.h"
62 #include "llvm/IR/DerivedTypes.h"
63 #include "llvm/IR/Dominators.h"
64 #include "llvm/IR/Function.h"
65 #include "llvm/IR/IRBuilder.h"
66 #include "llvm/IR/InstrTypes.h"
67 #include "llvm/IR/Instruction.h"
68 #include "llvm/IR/Instructions.h"
69 #include "llvm/IR/IntrinsicInst.h"
70 #include "llvm/IR/Module.h"
71 #include "llvm/IR/Type.h"
72 #include "llvm/IR/User.h"
73 #include "llvm/IR/Value.h"
74 #include "llvm/Pass.h"
75 #include "llvm/Support/Casting.h"
76 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/KnownBits.h"
78 #include "llvm/Support/MathExtras.h"
79 #include "llvm/Support/raw_ostream.h"
80 #include "llvm/Transforms/Vectorize.h"
81 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
90 #define DEBUG_TYPE "load-store-vectorizer"
92 STATISTIC(NumVectorInstructions
, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized
, "Number of scalar accesses vectorized");
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment
= 4;
100 /// ChainID is an arbitrary token that is allowed to be different only for the
101 /// accesses that are guaranteed to be considered non-consecutive by
102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103 /// together and reducing the number of instructions the main search operates on
104 /// at a time, i.e. this is to reduce compile time and nothing else as the main
105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
107 using ChainID
= const Value
*;
108 using InstrList
= SmallVector
<Instruction
*, 8>;
109 using InstrListMap
= MapVector
<ChainID
, InstrList
>;
116 TargetTransformInfo
&TTI
;
117 const DataLayout
&DL
;
121 Vectorizer(Function
&F
, AliasAnalysis
&AA
, DominatorTree
&DT
,
122 ScalarEvolution
&SE
, TargetTransformInfo
&TTI
)
123 : F(F
), AA(AA
), DT(DT
), SE(SE
), TTI(TTI
),
124 DL(F
.getParent()->getDataLayout()), Builder(SE
.getContext()) {}
129 unsigned getPointerAddressSpace(Value
*I
);
131 unsigned getAlignment(LoadInst
*LI
) const {
132 unsigned Align
= LI
->getAlignment();
136 return DL
.getABITypeAlignment(LI
->getType());
139 unsigned getAlignment(StoreInst
*SI
) const {
140 unsigned Align
= SI
->getAlignment();
144 return DL
.getABITypeAlignment(SI
->getValueOperand()->getType());
147 static const unsigned MaxDepth
= 3;
149 bool isConsecutiveAccess(Value
*A
, Value
*B
);
150 bool areConsecutivePointers(Value
*PtrA
, Value
*PtrB
, const APInt
&PtrDelta
,
151 unsigned Depth
= 0) const;
152 bool lookThroughComplexAddresses(Value
*PtrA
, Value
*PtrB
, APInt PtrDelta
,
153 unsigned Depth
) const;
154 bool lookThroughSelects(Value
*PtrA
, Value
*PtrB
, const APInt
&PtrDelta
,
155 unsigned Depth
) const;
157 /// After vectorization, reorder the instructions that I depends on
158 /// (the instructions defining its operands), to ensure they dominate I.
159 void reorder(Instruction
*I
);
161 /// Returns the first and the last instructions in Chain.
162 std::pair
<BasicBlock::iterator
, BasicBlock::iterator
>
163 getBoundaryInstrs(ArrayRef
<Instruction
*> Chain
);
165 /// Erases the original instructions after vectorizing.
166 void eraseInstructions(ArrayRef
<Instruction
*> Chain
);
168 /// "Legalize" the vector type that would be produced by combining \p
169 /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
170 /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
171 /// expected to have more than 4 elements.
172 std::pair
<ArrayRef
<Instruction
*>, ArrayRef
<Instruction
*>>
173 splitOddVectorElts(ArrayRef
<Instruction
*> Chain
, unsigned ElementSizeBits
);
175 /// Finds the largest prefix of Chain that's vectorizable, checking for
176 /// intervening instructions which may affect the memory accessed by the
177 /// instructions within Chain.
179 /// The elements of \p Chain must be all loads or all stores and must be in
181 ArrayRef
<Instruction
*> getVectorizablePrefix(ArrayRef
<Instruction
*> Chain
);
183 /// Collects load and store instructions to vectorize.
184 std::pair
<InstrListMap
, InstrListMap
> collectInstructions(BasicBlock
*BB
);
186 /// Processes the collected instructions, the \p Map. The values of \p Map
187 /// should be all loads or all stores.
188 bool vectorizeChains(InstrListMap
&Map
);
190 /// Finds the load/stores to consecutive memory addresses and vectorizes them.
191 bool vectorizeInstructions(ArrayRef
<Instruction
*> Instrs
);
193 /// Vectorizes the load instructions in Chain.
195 vectorizeLoadChain(ArrayRef
<Instruction
*> Chain
,
196 SmallPtrSet
<Instruction
*, 16> *InstructionsProcessed
);
198 /// Vectorizes the store instructions in Chain.
200 vectorizeStoreChain(ArrayRef
<Instruction
*> Chain
,
201 SmallPtrSet
<Instruction
*, 16> *InstructionsProcessed
);
203 /// Check if this load/store access is misaligned accesses.
204 bool accessIsMisaligned(unsigned SzInBytes
, unsigned AddressSpace
,
208 class LoadStoreVectorizerLegacyPass
: public FunctionPass
{
212 LoadStoreVectorizerLegacyPass() : FunctionPass(ID
) {
213 initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
216 bool runOnFunction(Function
&F
) override
;
218 StringRef
getPassName() const override
{
219 return "GPU Load and Store Vectorizer";
222 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
223 AU
.addRequired
<AAResultsWrapperPass
>();
224 AU
.addRequired
<ScalarEvolutionWrapperPass
>();
225 AU
.addRequired
<DominatorTreeWrapperPass
>();
226 AU
.addRequired
<TargetTransformInfoWrapperPass
>();
227 AU
.setPreservesCFG();
231 } // end anonymous namespace
233 char LoadStoreVectorizerLegacyPass::ID
= 0;
235 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass
, DEBUG_TYPE
,
236 "Vectorize load and Store instructions", false, false)
237 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass
)
238 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass
)
239 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass
)
240 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass
)
241 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass
)
242 INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass
, DEBUG_TYPE
,
243 "Vectorize load and store instructions", false, false)
245 Pass
*llvm::createLoadStoreVectorizerPass() {
246 return new LoadStoreVectorizerLegacyPass();
249 bool LoadStoreVectorizerLegacyPass::runOnFunction(Function
&F
) {
250 // Don't vectorize when the attribute NoImplicitFloat is used.
251 if (skipFunction(F
) || F
.hasFnAttribute(Attribute::NoImplicitFloat
))
254 AliasAnalysis
&AA
= getAnalysis
<AAResultsWrapperPass
>().getAAResults();
255 DominatorTree
&DT
= getAnalysis
<DominatorTreeWrapperPass
>().getDomTree();
256 ScalarEvolution
&SE
= getAnalysis
<ScalarEvolutionWrapperPass
>().getSE();
257 TargetTransformInfo
&TTI
=
258 getAnalysis
<TargetTransformInfoWrapperPass
>().getTTI(F
);
260 Vectorizer
V(F
, AA
, DT
, SE
, TTI
);
264 PreservedAnalyses
LoadStoreVectorizerPass::run(Function
&F
, FunctionAnalysisManager
&AM
) {
265 // Don't vectorize when the attribute NoImplicitFloat is used.
266 if (F
.hasFnAttribute(Attribute::NoImplicitFloat
))
267 return PreservedAnalyses::all();
269 AliasAnalysis
&AA
= AM
.getResult
<AAManager
>(F
);
270 DominatorTree
&DT
= AM
.getResult
<DominatorTreeAnalysis
>(F
);
271 ScalarEvolution
&SE
= AM
.getResult
<ScalarEvolutionAnalysis
>(F
);
272 TargetTransformInfo
&TTI
= AM
.getResult
<TargetIRAnalysis
>(F
);
274 Vectorizer
V(F
, AA
, DT
, SE
, TTI
);
275 bool Changed
= V
.run();
276 PreservedAnalyses PA
;
277 PA
.preserveSet
<CFGAnalyses
>();
278 return Changed
? PA
: PreservedAnalyses::all();
281 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
282 // vectors of Instructions.
283 static void propagateMetadata(Instruction
*I
, ArrayRef
<Instruction
*> IL
) {
284 SmallVector
<Value
*, 8> VL(IL
.begin(), IL
.end());
285 propagateMetadata(I
, VL
);
// Vectorizer Implementation
bool Vectorizer::run() {
  bool AnyVectorized = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    AnyVectorized |= vectorizeChains(LoadRefs);
    AnyVectorized |= vectorizeChains(StoreRefs);
  }

  return AnyVectorized;
}
303 unsigned Vectorizer::getPointerAddressSpace(Value
*I
) {
304 if (LoadInst
*L
= dyn_cast
<LoadInst
>(I
))
305 return L
->getPointerAddressSpace();
306 if (StoreInst
*S
= dyn_cast
<StoreInst
>(I
))
307 return S
->getPointerAddressSpace();
311 // FIXME: Merge with llvm::isConsecutiveAccess
312 bool Vectorizer::isConsecutiveAccess(Value
*A
, Value
*B
) {
313 Value
*PtrA
= getLoadStorePointerOperand(A
);
314 Value
*PtrB
= getLoadStorePointerOperand(B
);
315 unsigned ASA
= getPointerAddressSpace(A
);
316 unsigned ASB
= getPointerAddressSpace(B
);
318 // Check that the address spaces match and that the pointers are valid.
319 if (!PtrA
|| !PtrB
|| (ASA
!= ASB
))
322 // Make sure that A and B are different pointers of the same size type.
323 Type
*PtrATy
= PtrA
->getType()->getPointerElementType();
324 Type
*PtrBTy
= PtrB
->getType()->getPointerElementType();
326 PtrATy
->isVectorTy() != PtrBTy
->isVectorTy() ||
327 DL
.getTypeStoreSize(PtrATy
) != DL
.getTypeStoreSize(PtrBTy
) ||
328 DL
.getTypeStoreSize(PtrATy
->getScalarType()) !=
329 DL
.getTypeStoreSize(PtrBTy
->getScalarType()))
332 unsigned PtrBitWidth
= DL
.getPointerSizeInBits(ASA
);
333 APInt
Size(PtrBitWidth
, DL
.getTypeStoreSize(PtrATy
));
335 return areConsecutivePointers(PtrA
, PtrB
, Size
);
338 bool Vectorizer::areConsecutivePointers(Value
*PtrA
, Value
*PtrB
,
339 const APInt
&PtrDelta
,
340 unsigned Depth
) const {
341 unsigned PtrBitWidth
= DL
.getPointerTypeSizeInBits(PtrA
->getType());
342 APInt
OffsetA(PtrBitWidth
, 0);
343 APInt
OffsetB(PtrBitWidth
, 0);
344 PtrA
= PtrA
->stripAndAccumulateInBoundsConstantOffsets(DL
, OffsetA
);
345 PtrB
= PtrB
->stripAndAccumulateInBoundsConstantOffsets(DL
, OffsetB
);
347 APInt OffsetDelta
= OffsetB
- OffsetA
;
349 // Check if they are based on the same pointer. That makes the offsets
352 return OffsetDelta
== PtrDelta
;
354 // Compute the necessary base pointer delta to have the necessary final delta
355 // equal to the pointer delta requested.
356 APInt BaseDelta
= PtrDelta
- OffsetDelta
;
358 // Compute the distance with SCEV between the base pointers.
359 const SCEV
*PtrSCEVA
= SE
.getSCEV(PtrA
);
360 const SCEV
*PtrSCEVB
= SE
.getSCEV(PtrB
);
361 const SCEV
*C
= SE
.getConstant(BaseDelta
);
362 const SCEV
*X
= SE
.getAddExpr(PtrSCEVA
, C
);
366 // The above check will not catch the cases where one of the pointers is
367 // factorized but the other one is not, such as (C + (S * (A + B))) vs
368 // (AS + BS). Get the minus scev. That will allow re-combining the expresions
369 // and getting the simplified difference.
370 const SCEV
*Dist
= SE
.getMinusSCEV(PtrSCEVB
, PtrSCEVA
);
374 // Sometimes even this doesn't work, because SCEV can't always see through
375 // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
376 // things the hard way.
377 return lookThroughComplexAddresses(PtrA
, PtrB
, BaseDelta
, Depth
);
380 bool Vectorizer::lookThroughComplexAddresses(Value
*PtrA
, Value
*PtrB
,
382 unsigned Depth
) const {
383 auto *GEPA
= dyn_cast
<GetElementPtrInst
>(PtrA
);
384 auto *GEPB
= dyn_cast
<GetElementPtrInst
>(PtrB
);
386 return lookThroughSelects(PtrA
, PtrB
, PtrDelta
, Depth
);
388 // Look through GEPs after checking they're the same except for the last
390 if (GEPA
->getNumOperands() != GEPB
->getNumOperands() ||
391 GEPA
->getPointerOperand() != GEPB
->getPointerOperand())
393 gep_type_iterator GTIA
= gep_type_begin(GEPA
);
394 gep_type_iterator GTIB
= gep_type_begin(GEPB
);
395 for (unsigned I
= 0, E
= GEPA
->getNumIndices() - 1; I
< E
; ++I
) {
396 if (GTIA
.getOperand() != GTIB
.getOperand())
402 Instruction
*OpA
= dyn_cast
<Instruction
>(GTIA
.getOperand());
403 Instruction
*OpB
= dyn_cast
<Instruction
>(GTIB
.getOperand());
404 if (!OpA
|| !OpB
|| OpA
->getOpcode() != OpB
->getOpcode() ||
405 OpA
->getType() != OpB
->getType())
408 if (PtrDelta
.isNegative()) {
409 if (PtrDelta
.isMinSignedValue())
414 uint64_t Stride
= DL
.getTypeAllocSize(GTIA
.getIndexedType());
415 if (PtrDelta
.urem(Stride
) != 0)
417 unsigned IdxBitWidth
= OpA
->getType()->getScalarSizeInBits();
418 APInt IdxDiff
= PtrDelta
.udiv(Stride
).zextOrSelf(IdxBitWidth
);
420 // Only look through a ZExt/SExt.
421 if (!isa
<SExtInst
>(OpA
) && !isa
<ZExtInst
>(OpA
))
424 bool Signed
= isa
<SExtInst
>(OpA
);
426 // At this point A could be a function parameter, i.e. not an instruction
427 Value
*ValA
= OpA
->getOperand(0);
428 OpB
= dyn_cast
<Instruction
>(OpB
->getOperand(0));
429 if (!OpB
|| ValA
->getType() != OpB
->getType())
432 // Now we need to prove that adding IdxDiff to ValA won't overflow.
434 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
436 if (OpB
->getOpcode() == Instruction::Add
&&
437 isa
<ConstantInt
>(OpB
->getOperand(1)) &&
438 IdxDiff
.sle(cast
<ConstantInt
>(OpB
->getOperand(1))->getSExtValue())) {
440 Safe
= cast
<BinaryOperator
>(OpB
)->hasNoSignedWrap();
442 Safe
= cast
<BinaryOperator
>(OpB
)->hasNoUnsignedWrap();
445 unsigned BitWidth
= ValA
->getType()->getScalarSizeInBits();
448 // If all set bits of IdxDiff or any higher order bit other than the sign bit
449 // are known to be zero in ValA, we can add Diff to it while guaranteeing no
450 // overflow of any sort.
452 OpA
= dyn_cast
<Instruction
>(ValA
);
455 KnownBits
Known(BitWidth
);
456 computeKnownBits(OpA
, Known
, DL
, 0, nullptr, OpA
, &DT
);
457 APInt BitsAllowedToBeSet
= Known
.Zero
.zext(IdxDiff
.getBitWidth());
459 BitsAllowedToBeSet
.clearBit(BitWidth
- 1);
460 if (BitsAllowedToBeSet
.ult(IdxDiff
))
464 const SCEV
*OffsetSCEVA
= SE
.getSCEV(ValA
);
465 const SCEV
*OffsetSCEVB
= SE
.getSCEV(OpB
);
466 const SCEV
*C
= SE
.getConstant(IdxDiff
.trunc(BitWidth
));
467 const SCEV
*X
= SE
.getAddExpr(OffsetSCEVA
, C
);
468 return X
== OffsetSCEVB
;
471 bool Vectorizer::lookThroughSelects(Value
*PtrA
, Value
*PtrB
,
472 const APInt
&PtrDelta
,
473 unsigned Depth
) const {
474 if (Depth
++ == MaxDepth
)
477 if (auto *SelectA
= dyn_cast
<SelectInst
>(PtrA
)) {
478 if (auto *SelectB
= dyn_cast
<SelectInst
>(PtrB
)) {
479 return SelectA
->getCondition() == SelectB
->getCondition() &&
480 areConsecutivePointers(SelectA
->getTrueValue(),
481 SelectB
->getTrueValue(), PtrDelta
, Depth
) &&
482 areConsecutivePointers(SelectA
->getFalseValue(),
483 SelectB
->getFalseValue(), PtrDelta
, Depth
);
489 void Vectorizer::reorder(Instruction
*I
) {
490 OrderedBasicBlock
OBB(I
->getParent());
491 SmallPtrSet
<Instruction
*, 16> InstructionsToMove
;
492 SmallVector
<Instruction
*, 16> Worklist
;
494 Worklist
.push_back(I
);
495 while (!Worklist
.empty()) {
496 Instruction
*IW
= Worklist
.pop_back_val();
497 int NumOperands
= IW
->getNumOperands();
498 for (int i
= 0; i
< NumOperands
; i
++) {
499 Instruction
*IM
= dyn_cast
<Instruction
>(IW
->getOperand(i
));
500 if (!IM
|| IM
->getOpcode() == Instruction::PHI
)
503 // If IM is in another BB, no need to move it, because this pass only
504 // vectorizes instructions within one BB.
505 if (IM
->getParent() != I
->getParent())
508 if (!OBB
.dominates(IM
, I
)) {
509 InstructionsToMove
.insert(IM
);
510 Worklist
.push_back(IM
);
515 // All instructions to move should follow I. Start from I, not from begin().
516 for (auto BBI
= I
->getIterator(), E
= I
->getParent()->end(); BBI
!= E
;
518 if (!InstructionsToMove
.count(&*BBI
))
520 Instruction
*IM
= &*BBI
;
522 IM
->removeFromParent();
527 std::pair
<BasicBlock::iterator
, BasicBlock::iterator
>
528 Vectorizer::getBoundaryInstrs(ArrayRef
<Instruction
*> Chain
) {
529 Instruction
*C0
= Chain
[0];
530 BasicBlock::iterator FirstInstr
= C0
->getIterator();
531 BasicBlock::iterator LastInstr
= C0
->getIterator();
533 BasicBlock
*BB
= C0
->getParent();
534 unsigned NumFound
= 0;
535 for (Instruction
&I
: *BB
) {
536 if (!is_contained(Chain
, &I
))
541 FirstInstr
= I
.getIterator();
543 if (NumFound
== Chain
.size()) {
544 LastInstr
= I
.getIterator();
549 // Range is [first, last).
550 return std::make_pair(FirstInstr
, ++LastInstr
);
553 void Vectorizer::eraseInstructions(ArrayRef
<Instruction
*> Chain
) {
554 SmallVector
<Instruction
*, 16> Instrs
;
555 for (Instruction
*I
: Chain
) {
556 Value
*PtrOperand
= getLoadStorePointerOperand(I
);
557 assert(PtrOperand
&& "Instruction must have a pointer operand.");
559 if (GetElementPtrInst
*GEP
= dyn_cast
<GetElementPtrInst
>(PtrOperand
))
560 Instrs
.push_back(GEP
);
563 // Erase instructions.
564 for (Instruction
*I
: Instrs
)
566 I
->eraseFromParent();
569 std::pair
<ArrayRef
<Instruction
*>, ArrayRef
<Instruction
*>>
570 Vectorizer::splitOddVectorElts(ArrayRef
<Instruction
*> Chain
,
571 unsigned ElementSizeBits
) {
572 unsigned ElementSizeBytes
= ElementSizeBits
/ 8;
573 unsigned SizeBytes
= ElementSizeBytes
* Chain
.size();
574 unsigned NumLeft
= (SizeBytes
- (SizeBytes
% 4)) / ElementSizeBytes
;
575 if (NumLeft
== Chain
.size()) {
576 if ((NumLeft
& 1) == 0)
577 NumLeft
/= 2; // Split even in half
579 --NumLeft
; // Split off last element
580 } else if (NumLeft
== 0)
582 return std::make_pair(Chain
.slice(0, NumLeft
), Chain
.slice(NumLeft
));
585 ArrayRef
<Instruction
*>
586 Vectorizer::getVectorizablePrefix(ArrayRef
<Instruction
*> Chain
) {
587 // These are in BB order, unlike Chain, which is in address order.
588 SmallVector
<Instruction
*, 16> MemoryInstrs
;
589 SmallVector
<Instruction
*, 16> ChainInstrs
;
591 bool IsLoadChain
= isa
<LoadInst
>(Chain
[0]);
593 for (Instruction
*I
: Chain
) {
595 assert(isa
<LoadInst
>(I
) &&
596 "All elements of Chain must be loads, or all must be stores.");
598 assert(isa
<StoreInst
>(I
) &&
599 "All elements of Chain must be loads, or all must be stores.");
603 for (Instruction
&I
: make_range(getBoundaryInstrs(Chain
))) {
604 if (isa
<LoadInst
>(I
) || isa
<StoreInst
>(I
)) {
605 if (!is_contained(Chain
, &I
))
606 MemoryInstrs
.push_back(&I
);
608 ChainInstrs
.push_back(&I
);
609 } else if (isa
<IntrinsicInst
>(&I
) &&
610 cast
<IntrinsicInst
>(&I
)->getIntrinsicID() ==
611 Intrinsic::sideeffect
) {
612 // Ignore llvm.sideeffect calls.
613 } else if (IsLoadChain
&& (I
.mayWriteToMemory() || I
.mayThrow())) {
614 LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
617 } else if (!IsLoadChain
&& (I
.mayReadOrWriteMemory() || I
.mayThrow())) {
618 LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
624 OrderedBasicBlock
OBB(Chain
[0]->getParent());
626 // Loop until we find an instruction in ChainInstrs that we can't vectorize.
627 unsigned ChainInstrIdx
= 0;
628 Instruction
*BarrierMemoryInstr
= nullptr;
630 for (unsigned E
= ChainInstrs
.size(); ChainInstrIdx
< E
; ++ChainInstrIdx
) {
631 Instruction
*ChainInstr
= ChainInstrs
[ChainInstrIdx
];
633 // If a barrier memory instruction was found, chain instructions that follow
634 // will not be added to the valid prefix.
635 if (BarrierMemoryInstr
&& OBB
.dominates(BarrierMemoryInstr
, ChainInstr
))
638 // Check (in BB order) if any instruction prevents ChainInstr from being
639 // vectorized. Find and store the first such "conflicting" instruction.
640 for (Instruction
*MemInstr
: MemoryInstrs
) {
641 // If a barrier memory instruction was found, do not check past it.
642 if (BarrierMemoryInstr
&& OBB
.dominates(BarrierMemoryInstr
, MemInstr
))
645 auto *MemLoad
= dyn_cast
<LoadInst
>(MemInstr
);
646 auto *ChainLoad
= dyn_cast
<LoadInst
>(ChainInstr
);
647 if (MemLoad
&& ChainLoad
)
650 // We can ignore the alias if the we have a load store pair and the load
651 // is known to be invariant. The load cannot be clobbered by the store.
652 auto IsInvariantLoad
= [](const LoadInst
*LI
) -> bool {
653 return LI
->getMetadata(LLVMContext::MD_invariant_load
);
656 // We can ignore the alias as long as the load comes before the store,
657 // because that means we won't be moving the load past the store to
658 // vectorize it (the vectorized load is inserted at the location of the
659 // first load in the chain).
660 if (isa
<StoreInst
>(MemInstr
) && ChainLoad
&&
661 (IsInvariantLoad(ChainLoad
) || OBB
.dominates(ChainLoad
, MemInstr
)))
664 // Same case, but in reverse.
665 if (MemLoad
&& isa
<StoreInst
>(ChainInstr
) &&
666 (IsInvariantLoad(MemLoad
) || OBB
.dominates(MemLoad
, ChainInstr
)))
669 if (!AA
.isNoAlias(MemoryLocation::get(MemInstr
),
670 MemoryLocation::get(ChainInstr
))) {
672 dbgs() << "LSV: Found alias:\n"
673 " Aliasing instruction and pointer:\n"
674 << " " << *MemInstr
<< '\n'
675 << " " << *getLoadStorePointerOperand(MemInstr
) << '\n'
676 << " Aliased instruction and pointer:\n"
677 << " " << *ChainInstr
<< '\n'
678 << " " << *getLoadStorePointerOperand(ChainInstr
) << '\n';
680 // Save this aliasing memory instruction as a barrier, but allow other
681 // instructions that precede the barrier to be vectorized with this one.
682 BarrierMemoryInstr
= MemInstr
;
686 // Continue the search only for store chains, since vectorizing stores that
687 // precede an aliasing load is valid. Conversely, vectorizing loads is valid
688 // up to an aliasing store, but should not pull loads from further down in
690 if (IsLoadChain
&& BarrierMemoryInstr
) {
691 // The BarrierMemoryInstr is a store that precedes ChainInstr.
692 assert(OBB
.dominates(BarrierMemoryInstr
, ChainInstr
));
697 // Find the largest prefix of Chain whose elements are all in
698 // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
699 // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
701 SmallPtrSet
<Instruction
*, 8> VectorizableChainInstrs(
702 ChainInstrs
.begin(), ChainInstrs
.begin() + ChainInstrIdx
);
703 unsigned ChainIdx
= 0;
704 for (unsigned ChainLen
= Chain
.size(); ChainIdx
< ChainLen
; ++ChainIdx
) {
705 if (!VectorizableChainInstrs
.count(Chain
[ChainIdx
]))
708 return Chain
.slice(0, ChainIdx
);
711 static ChainID
getChainID(const Value
*Ptr
, const DataLayout
&DL
) {
712 const Value
*ObjPtr
= GetUnderlyingObject(Ptr
, DL
);
713 if (const auto *Sel
= dyn_cast
<SelectInst
>(ObjPtr
)) {
714 // The select's themselves are distinct instructions even if they share the
715 // same condition and evaluate to consecutive pointers for true and false
716 // values of the condition. Therefore using the select's themselves for
717 // grouping instructions would put consecutive accesses into different lists
718 // and they won't be even checked for being consecutive, and won't be
720 return Sel
->getCondition();
725 std::pair
<InstrListMap
, InstrListMap
>
726 Vectorizer::collectInstructions(BasicBlock
*BB
) {
727 InstrListMap LoadRefs
;
728 InstrListMap StoreRefs
;
730 for (Instruction
&I
: *BB
) {
731 if (!I
.mayReadOrWriteMemory())
734 if (LoadInst
*LI
= dyn_cast
<LoadInst
>(&I
)) {
738 // Skip if it's not legal.
739 if (!TTI
.isLegalToVectorizeLoad(LI
))
742 Type
*Ty
= LI
->getType();
743 if (!VectorType::isValidElementType(Ty
->getScalarType()))
746 // Skip weird non-byte sizes. They probably aren't worth the effort of
747 // handling correctly.
748 unsigned TySize
= DL
.getTypeSizeInBits(Ty
);
749 if ((TySize
% 8) != 0)
752 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
753 // functions are currently using an integer type for the vectorized
754 // load/store, and does not support casting between the integer type and a
755 // vector of pointers (e.g. i64 to <2 x i16*>)
756 if (Ty
->isVectorTy() && Ty
->isPtrOrPtrVectorTy())
759 Value
*Ptr
= LI
->getPointerOperand();
760 unsigned AS
= Ptr
->getType()->getPointerAddressSpace();
761 unsigned VecRegSize
= TTI
.getLoadStoreVecRegBitWidth(AS
);
763 unsigned VF
= VecRegSize
/ TySize
;
764 VectorType
*VecTy
= dyn_cast
<VectorType
>(Ty
);
766 // No point in looking at these if they're too big to vectorize.
767 if (TySize
> VecRegSize
/ 2 ||
768 (VecTy
&& TTI
.getLoadVectorFactor(VF
, TySize
, TySize
/ 8, VecTy
) == 0))
771 // Make sure all the users of a vector are constant-index extracts.
772 if (isa
<VectorType
>(Ty
) && !llvm::all_of(LI
->users(), [](const User
*U
) {
773 const ExtractElementInst
*EEI
= dyn_cast
<ExtractElementInst
>(U
);
774 return EEI
&& isa
<ConstantInt
>(EEI
->getOperand(1));
778 // Save the load locations.
779 const ChainID ID
= getChainID(Ptr
, DL
);
780 LoadRefs
[ID
].push_back(LI
);
781 } else if (StoreInst
*SI
= dyn_cast
<StoreInst
>(&I
)) {
785 // Skip if it's not legal.
786 if (!TTI
.isLegalToVectorizeStore(SI
))
789 Type
*Ty
= SI
->getValueOperand()->getType();
790 if (!VectorType::isValidElementType(Ty
->getScalarType()))
793 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
794 // functions are currently using an integer type for the vectorized
795 // load/store, and does not support casting between the integer type and a
796 // vector of pointers (e.g. i64 to <2 x i16*>)
797 if (Ty
->isVectorTy() && Ty
->isPtrOrPtrVectorTy())
800 // Skip weird non-byte sizes. They probably aren't worth the effort of
801 // handling correctly.
802 unsigned TySize
= DL
.getTypeSizeInBits(Ty
);
803 if ((TySize
% 8) != 0)
806 Value
*Ptr
= SI
->getPointerOperand();
807 unsigned AS
= Ptr
->getType()->getPointerAddressSpace();
808 unsigned VecRegSize
= TTI
.getLoadStoreVecRegBitWidth(AS
);
810 unsigned VF
= VecRegSize
/ TySize
;
811 VectorType
*VecTy
= dyn_cast
<VectorType
>(Ty
);
813 // No point in looking at these if they're too big to vectorize.
814 if (TySize
> VecRegSize
/ 2 ||
815 (VecTy
&& TTI
.getStoreVectorFactor(VF
, TySize
, TySize
/ 8, VecTy
) == 0))
818 if (isa
<VectorType
>(Ty
) && !llvm::all_of(SI
->users(), [](const User
*U
) {
819 const ExtractElementInst
*EEI
= dyn_cast
<ExtractElementInst
>(U
);
820 return EEI
&& isa
<ConstantInt
>(EEI
->getOperand(1));
824 // Save store location.
825 const ChainID ID
= getChainID(Ptr
, DL
);
826 StoreRefs
[ID
].push_back(SI
);
830 return {LoadRefs
, StoreRefs
};
833 bool Vectorizer::vectorizeChains(InstrListMap
&Map
) {
834 bool Changed
= false;
836 for (const std::pair
<ChainID
, InstrList
> &Chain
: Map
) {
837 unsigned Size
= Chain
.second
.size();
841 LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size
<< ".\n");
843 // Process the stores in chunks of 64.
844 for (unsigned CI
= 0, CE
= Size
; CI
< CE
; CI
+= 64) {
845 unsigned Len
= std::min
<unsigned>(CE
- CI
, 64);
846 ArrayRef
<Instruction
*> Chunk(&Chain
.second
[CI
], Len
);
847 Changed
|= vectorizeInstructions(Chunk
);
854 bool Vectorizer::vectorizeInstructions(ArrayRef
<Instruction
*> Instrs
) {
855 LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs
.size()
856 << " instructions.\n");
857 SmallVector
<int, 16> Heads
, Tails
;
858 int ConsecutiveChain
[64];
860 // Do a quadratic search on all of the given loads/stores and find all of the
861 // pairs of loads/stores that follow each other.
862 for (int i
= 0, e
= Instrs
.size(); i
< e
; ++i
) {
863 ConsecutiveChain
[i
] = -1;
864 for (int j
= e
- 1; j
>= 0; --j
) {
868 if (isConsecutiveAccess(Instrs
[i
], Instrs
[j
])) {
869 if (ConsecutiveChain
[i
] != -1) {
870 int CurDistance
= std::abs(ConsecutiveChain
[i
] - i
);
871 int NewDistance
= std::abs(ConsecutiveChain
[i
] - j
);
872 if (j
< i
|| NewDistance
> CurDistance
)
873 continue; // Should not insert.
878 ConsecutiveChain
[i
] = j
;
883 bool Changed
= false;
884 SmallPtrSet
<Instruction
*, 16> InstructionsProcessed
;
886 for (int Head
: Heads
) {
887 if (InstructionsProcessed
.count(Instrs
[Head
]))
889 bool LongerChainExists
= false;
890 for (unsigned TIt
= 0; TIt
< Tails
.size(); TIt
++)
891 if (Head
== Tails
[TIt
] &&
892 !InstructionsProcessed
.count(Instrs
[Heads
[TIt
]])) {
893 LongerChainExists
= true;
896 if (LongerChainExists
)
899 // We found an instr that starts a chain. Now follow the chain and try to
901 SmallVector
<Instruction
*, 16> Operands
;
903 while (I
!= -1 && (is_contained(Tails
, I
) || is_contained(Heads
, I
))) {
904 if (InstructionsProcessed
.count(Instrs
[I
]))
907 Operands
.push_back(Instrs
[I
]);
908 I
= ConsecutiveChain
[I
];
911 bool Vectorized
= false;
912 if (isa
<LoadInst
>(*Operands
.begin()))
913 Vectorized
= vectorizeLoadChain(Operands
, &InstructionsProcessed
);
915 Vectorized
= vectorizeStoreChain(Operands
, &InstructionsProcessed
);
917 Changed
|= Vectorized
;
/// Try to replace the chain of consecutive StoreInsts in \p Chain with a
/// single wide vector store. Returns true if any IR change was made. Every
/// instruction of \p Chain that is considered (whether or not vectorization
/// succeeds) is added to \p InstructionsProcessed so the caller never
/// re-visits it. May recurse on sub-chains when the chain must be split.
bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy = nullptr;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      // Stored pointers are handled as integers of the same bit width.
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  // Max number of elements of this type that fit in one vector register.
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

  // Bail out on odd element sizes, registers too small to hold two
  // elements, or degenerate (single-element) chains.
  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  // Trim the chain to the longest leading prefix that is safe to vectorize
  // (no intervening aliasing accesses, etc.).
  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  // Stores of vectors are flattened: the result vector holds
  // ChainSize * (elements per stored vector) scalar lanes.
  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      // Not an alloca-space access: we can't raise its alignment, so fall
      // back to two naturally-aligned halves.
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
             vectorizeStoreChain(Chains.second, InstructionsProcessed);
    }

    // Stack accesses: try to prove (or force) a larger alignment on the
    // underlying object instead of splitting.
    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign != 0)
      Alignment = NewAlign;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  // The combined store must be placed after every contributing store, so
  // insert at the last instruction of the chain's span.
  Builder.SetInsertPoint(&*Last);

  // Build the value to store by inserting each chain element (or each lane
  // of each stored vector) into one wide vector.
  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    // Vector-typed stores: extract every lane of every stored value and
    // re-insert it at its flattened index.
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Vec = Builder.CreateInsertElement(Vec, Extract,
                                          Builder.getInt32(NewIdx));
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        // BitOrPointerCast: stored pointers were re-typed as integers above.
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Vec = Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
      Vec,
      Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
      Alignment);
  // Merge the metadata (aliasing, etc.) of the original stores onto SI.
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}
1073 bool Vectorizer::vectorizeLoadChain(
1074 ArrayRef
<Instruction
*> Chain
,
1075 SmallPtrSet
<Instruction
*, 16> *InstructionsProcessed
) {
1076 LoadInst
*L0
= cast
<LoadInst
>(Chain
[0]);
1078 // If the vector has an int element, default to int for the whole load.
1080 for (const auto &V
: Chain
) {
1081 LoadTy
= cast
<LoadInst
>(V
)->getType();
1082 if (LoadTy
->isIntOrIntVectorTy())
1085 if (LoadTy
->isPtrOrPtrVectorTy()) {
1086 LoadTy
= Type::getIntNTy(F
.getParent()->getContext(),
1087 DL
.getTypeSizeInBits(LoadTy
));
1092 unsigned Sz
= DL
.getTypeSizeInBits(LoadTy
);
1093 unsigned AS
= L0
->getPointerAddressSpace();
1094 unsigned VecRegSize
= TTI
.getLoadStoreVecRegBitWidth(AS
);
1095 unsigned VF
= VecRegSize
/ Sz
;
1096 unsigned ChainSize
= Chain
.size();
1097 unsigned Alignment
= getAlignment(L0
);
1099 if (!isPowerOf2_32(Sz
) || VF
< 2 || ChainSize
< 2) {
1100 InstructionsProcessed
->insert(Chain
.begin(), Chain
.end());
1104 ArrayRef
<Instruction
*> NewChain
= getVectorizablePrefix(Chain
);
1105 if (NewChain
.empty()) {
1106 // No vectorization possible.
1107 InstructionsProcessed
->insert(Chain
.begin(), Chain
.end());
1110 if (NewChain
.size() == 1) {
1111 // Failed after the first instruction. Discard it and try the smaller chain.
1112 InstructionsProcessed
->insert(NewChain
.front());
1116 // Update Chain to the valid vectorizable subchain.
1118 ChainSize
= Chain
.size();
1120 // Check if it's legal to vectorize this chain. If not, split the chain and
1122 unsigned EltSzInBytes
= Sz
/ 8;
1123 unsigned SzInBytes
= EltSzInBytes
* ChainSize
;
1125 VectorType
*VecLoadTy
= dyn_cast
<VectorType
>(LoadTy
);
1127 VecTy
= VectorType::get(LoadTy
->getScalarType(),
1128 Chain
.size() * VecLoadTy
->getNumElements());
1130 VecTy
= VectorType::get(LoadTy
, Chain
.size());
1132 // If it's more than the max vector size or the target has a better
1133 // vector factor, break it into two pieces.
1134 unsigned TargetVF
= TTI
.getLoadVectorFactor(VF
, Sz
, SzInBytes
, VecTy
);
1135 if (ChainSize
> VF
|| (VF
!= TargetVF
&& TargetVF
< ChainSize
)) {
1136 LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1137 " Creating two separate arrays.\n");
1138 return vectorizeLoadChain(Chain
.slice(0, TargetVF
), InstructionsProcessed
) |
1139 vectorizeLoadChain(Chain
.slice(TargetVF
), InstructionsProcessed
);
1142 // We won't try again to vectorize the elements of the chain, regardless of
1143 // whether we succeed below.
1144 InstructionsProcessed
->insert(Chain
.begin(), Chain
.end());
1146 // If the load is going to be misaligned, don't vectorize it.
1147 if (accessIsMisaligned(SzInBytes
, AS
, Alignment
)) {
1148 if (L0
->getPointerAddressSpace() != DL
.getAllocaAddrSpace()) {
1149 auto Chains
= splitOddVectorElts(Chain
, Sz
);
1150 return vectorizeLoadChain(Chains
.first
, InstructionsProcessed
) |
1151 vectorizeLoadChain(Chains
.second
, InstructionsProcessed
);
1154 unsigned NewAlign
= getOrEnforceKnownAlignment(L0
->getPointerOperand(),
1155 StackAdjustedAlignment
,
1156 DL
, L0
, nullptr, &DT
);
1158 Alignment
= NewAlign
;
1160 Alignment
= NewAlign
;
1163 if (!TTI
.isLegalToVectorizeLoadChain(SzInBytes
, Alignment
, AS
)) {
1164 auto Chains
= splitOddVectorElts(Chain
, Sz
);
1165 return vectorizeLoadChain(Chains
.first
, InstructionsProcessed
) |
1166 vectorizeLoadChain(Chains
.second
, InstructionsProcessed
);
1170 dbgs() << "LSV: Loads to vectorize:\n";
1171 for (Instruction
*I
: Chain
)
1175 // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1176 // Last may have changed since then, but the value of First won't have. If it
1177 // matters, we could compute getBoundaryInstrs only once and reuse it here.
1178 BasicBlock::iterator First
, Last
;
1179 std::tie(First
, Last
) = getBoundaryInstrs(Chain
);
1180 Builder
.SetInsertPoint(&*First
);
1183 Builder
.CreateBitCast(L0
->getPointerOperand(), VecTy
->getPointerTo(AS
));
1184 LoadInst
*LI
= Builder
.CreateAlignedLoad(VecTy
, Bitcast
, Alignment
);
1185 propagateMetadata(LI
, Chain
);
1188 SmallVector
<Instruction
*, 16> InstrsToErase
;
1190 unsigned VecWidth
= VecLoadTy
->getNumElements();
1191 for (unsigned I
= 0, E
= Chain
.size(); I
!= E
; ++I
) {
1192 for (auto Use
: Chain
[I
]->users()) {
1193 // All users of vector loads are ExtractElement instructions with
1194 // constant indices, otherwise we would have bailed before now.
1195 Instruction
*UI
= cast
<Instruction
>(Use
);
1196 unsigned Idx
= cast
<ConstantInt
>(UI
->getOperand(1))->getZExtValue();
1197 unsigned NewIdx
= Idx
+ I
* VecWidth
;
1198 Value
*V
= Builder
.CreateExtractElement(LI
, Builder
.getInt32(NewIdx
),
1200 if (V
->getType() != UI
->getType())
1201 V
= Builder
.CreateBitCast(V
, UI
->getType());
1203 // Replace the old instruction.
1204 UI
->replaceAllUsesWith(V
);
1205 InstrsToErase
.push_back(UI
);
1209 // Bitcast might not be an Instruction, if the value being loaded is a
1210 // constant. In that case, no need to reorder anything.
1211 if (Instruction
*BitcastInst
= dyn_cast
<Instruction
>(Bitcast
))
1212 reorder(BitcastInst
);
1214 for (auto I
: InstrsToErase
)
1215 I
->eraseFromParent();
1217 for (unsigned I
= 0, E
= Chain
.size(); I
!= E
; ++I
) {
1218 Value
*CV
= Chain
[I
];
1220 Builder
.CreateExtractElement(LI
, Builder
.getInt32(I
), CV
->getName());
1221 if (V
->getType() != CV
->getType()) {
1222 V
= Builder
.CreateBitOrPointerCast(V
, CV
->getType());
1225 // Replace the old instruction.
1226 CV
->replaceAllUsesWith(V
);
1229 if (Instruction
*BitcastInst
= dyn_cast
<Instruction
>(Bitcast
))
1230 reorder(BitcastInst
);
1233 eraseInstructions(Chain
);
1235 ++NumVectorInstructions
;
1236 NumScalarsVectorized
+= Chain
.size();
1240 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes
, unsigned AddressSpace
,
1241 unsigned Alignment
) {
1242 if (Alignment
% SzInBytes
== 0)
1246 bool Allows
= TTI
.allowsMisalignedMemoryAccesses(F
.getParent()->getContext(),
1247 SzInBytes
* 8, AddressSpace
,
1249 LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1250 << " and fast? " << Fast
<< "\n";);
1251 return !Allows
|| !Fast
;