//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
//===----------------------------------------------------------------------===//
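
// Illustrative sketch (not taken from the original comments or a real kernel):
// four adjacent i32 loads such as
//
//   %a = load i32, i32* %p, align 4
//   %p1 = getelementptr inbounds i32, i32* %p, i64 1
//   %b = load i32, i32* %p1, align 4
//   (and two more at offsets 2 and 3)
//
// are rewritten into a single vector load plus extracts, which are free on the
// GPUs described above:
//
//   %cast = bitcast i32* %p to <4 x i32>*
//   %vec = load <4 x i32>, <4 x i32>* %cast, align 4
//   %a = extractelement <4 x i32> %vec, i32 0
//   %b = extractelement <4 x i32> %vec, i32 1
//   ...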

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {

/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together and reducing the number of instructions the main search operates on
/// at a time, i.e. this is to reduce compile time and nothing else as the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be relied upon.
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  unsigned getPointerAddressSpace(Value *I);

  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

  static const unsigned MaxDepth = 3;

  bool isConsecutiveAccess(Value *A, Value *B);
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks whether this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  bool Changed = V.run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}
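
// Worked example (illustrative only, not from the original comments): for two
// i32 loads, one from %p and one from "getelementptr inbounds i32, i32* %p,
// i64 1", the pointer delta that isConsecutiveAccess asks about is the store
// size of i32 (4 bytes), and the accesses are reported as consecutive.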

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}

bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
                                        APInt PtrDelta, unsigned Depth) const {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0);
  APInt OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return false;

  // In case we have to shrink the pointer,
  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
  // possible overflow and the value should fit into the smallest data type
  // used in the cast/gep chain.
  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
         OffsetB.getMinSignedBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == PtrDelta;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the pointer delta requested.
  APInt BaseDelta = PtrDelta - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // The above check will not catch the cases where one of the pointers is
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
  // (AS + BS). Get the minus scev. That will allow re-combining the expressions
  // and getting the simplified difference.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  if (C == Dist)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}
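
// Illustrative case (not from the original comments) of what the "hard way"
// handles: two GEPs whose last indices are sext(i32 %i) and
// sext(i32 (add nsw %i, 1)). SCEV may fail to fold their difference, but the
// pointers differ by exactly one stride once the "add nsw 1" is proven not to
// overflow, which is what the NSW/NUW and known-bits reasoning below
// establishes.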

bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(GTIA, GTIB);
  }
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
  // overflow of any sort.
  if (!Safe) {
    OpA = dyn_cast<Instruction>(ValA);
    if (!OpA)
      return false;
    KnownBits Known(BitWidth);
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}

bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
                                    const APInt &PtrDelta,
                                    unsigned Depth) const {
  if (Depth++ == MaxDepth)
    return false;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      return SelectA->getCondition() == SelectB->getCondition() &&
             areConsecutivePointers(SelectA->getTrueValue(),
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
             areConsecutivePointers(SelectA->getFalseValue(),
                                    SelectB->getFalseValue(), PtrDelta, Depth);
    }
  }
  return false;
}
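
// A note on why reorder() below exists (this summarizes the behavior of
// vectorizeLoadChain, it is not new behavior): the vectorized load is inserted
// at the first chain element in basic-block order, but its address operand is
// taken from the lowest-address load, whose pointer (e.g. a GEP) may be defined
// further down the block. reorder() hoists such defs so they dominate the new
// load.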

void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1)
      FirstInstr = I.getIterator();
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getLoadStorePointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}
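
// Worked example for splitOddVectorElts below (illustrative numbers): a chain
// of seven i16 accesses is 14 bytes, so NumLeft = (14 - 14 % 4) / 2 = 6 and the
// split is 6 elements (12 bytes) + 1 element (2 bytes). For four i16 accesses
// (8 bytes) NumLeft equals the chain size, so the chain is split evenly into
// 2 + 2 elements.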

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
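
// Sketch of the scan below (illustrative, it mirrors the code rather than
// extending it): for a load chain L1, L2, L3 where a store S that may alias L3
// appears before L3 in the basic block, S becomes the barrier; L3 is rejected
// and only the already-validated loads (L1, L2) can remain in the returned
// prefix.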

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  for (Instruction *I : Chain) {
    if (IsLoadChain)
      assert(isa<LoadInst>(I) &&
             "All elements of Chain must be loads, or all must be stores.");
    else
      assert(isa<StoreInst>(I) &&
             "All elements of Chain must be loads, or all must be stores.");
  }

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::sideeffect) {
      // Ignore llvm.sideeffect calls.
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
                        << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                        << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if we have a load/store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && ChainLoad &&
          (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
        continue;

      // Same case, but in reverse.
      if (MemLoad && isa<StoreInst>(ChainInstr) &&
          (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down the
    // basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}
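
// Example of the grouping below (illustrative IR, not from the original
// comments): accesses through
//   %s0 = select i1 %c, i32* %a, i32* %b
//   %s1 = select i1 %c, i32* %a1, i32* %b1   ; where %a1/%b1 are %a/%b plus one
// share the select condition %c as their ChainID, so they stay in one candidate
// list and can still be recognized as consecutive by lookThroughSelects.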

static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
  const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
    // The selects themselves are distinct instructions even if they share the
    // same condition and evaluate to consecutive pointers for true and false
    // values of the condition. Therefore using the selects themselves for
    // grouping instructions would put consecutive accesses into different lists
    // and they won't be even checked for being consecutive, and won't be
    // vectorized.
    return Sel->getCondition();
  }
  return ObjPtr;
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      const ChainID ID = getChainID(Ptr, DL);
      LoadRefs[ID].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      const ChainID ID = getChainID(Ptr, DL);
      StoreRefs[ID].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<ChainID, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the stores in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}
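
// How the search below strings accesses together (illustrative walk-through):
// for three i32 accesses A, B, C at byte offsets 0, 4, 8, the quadratic scan
// records ConsecutiveChain[A] = B and ConsecutiveChain[B] = C and marks A, B as
// heads and B, C as tails. A is the only head that is not the tail of an
// unprocessed chain, so the chain A -> B -> C is followed from A and handed to
// vectorizeLoadChain/vectorizeStoreChain.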

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
                    << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}
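
// Shape of the rewrite performed below (illustrative IR, not from a test):
// four scalar "store i32 %vN, i32* %pN" at consecutive addresses become a chain
// of insertelement instructions building a <4 x i32> value, followed by a
// single "store <4 x i32>" through a bitcast of the lowest-address pointer.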

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy = nullptr;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }
  assert(StoreTy && "Failed to find store type");

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
             vectorizeStoreChain(Chains.second, InstructionsProcessed);
    }

    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign != 0)
      Alignment = NewAlign;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
      Vec,
      Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
      Alignment);
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}
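
// The load case below mirrors the store case: a chain of consecutive scalar
// loads becomes one wide "load <N x Ty>" through a bitcast of the
// lowest-address pointer, and every original load is replaced by an
// extractelement from the wide value (with an extra bitcast when the element
// types differ). Illustrative summary only; the exact IR depends on the chain.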

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy = nullptr;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }
  assert(LoadTy && "Can't determine LoadInst type from chain");

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(L0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
             vectorizeLoadChain(Chains.second, InstructionsProcessed);
    }

    Alignment = getOrEnforceKnownAlignment(
        L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
  }

  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
  propagateMetadata(LI, Chain);

  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the value being loaded is a
    // constant. In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}
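
// Example of the check below (illustrative numbers): a 16-byte chain whose
// first access only has 4-byte alignment is not aligned to the access size, so
// the target is consulted; if it reports the misaligned access as disallowed or
// slow, the caller splits the chain instead of emitting the wide access.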

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
  if (Alignment % SzInBytes == 0)
    return false;

  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
                    << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}