//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
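//
// This pass merges consecutive, adjacent loads and stores in a basic block
// into wider vector loads and stores. It is aimed primarily at GPU targets,
// where one wide memory access is typically cheaper than the equivalent
// sequence of scalar accesses.
//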

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

typedef SmallVector<Instruction *, 8> InstrList;
typedef MapVector<Value *, InstrList> InstrListMap;
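
// An InstrListMap groups the candidate loads (or stores) of a basic block by
// the underlying object of their pointer operands (see collectInstructions),
// so only accesses into the same object are considered for chaining.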

namespace {

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  Value *getPointerOperand(Value *I) const;

  GetElementPtrInst *getSourceGEP(Value *Src) const;

  unsigned getPointerAddressSpace(Value *I);

  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

  bool isConsecutiveAccess(Value *A, Value *B);

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks whether this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

class LoadStoreVectorizer : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizer() : FunctionPass(ID) {
    initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

char LoadStoreVectorizer::ID = 0;

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizer();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

bool LoadStoreVectorizer::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

Value *Vectorizer::getPointerOperand(Value *I) const {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
  // First strip pointer bitcasts. Make sure pointee size is the same with
  // and without casts.
  // TODO: a stride set by the add instruction below can match the difference
  // in pointee type size here. Currently it will not be vectorized.
  Value *SrcPtr = getPointerOperand(Src);
  Value *SrcBase = SrcPtr->stripPointerCasts();
  if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
      DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
    SrcPtr = SrcBase;
  return dyn_cast<GetElementPtrInst>(SrcPtr);
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == Size;

  // Compute the base pointer delta needed for the final delta to equal the
  // access size.
  APInt BaseDelta = Size - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.

  // Look through GEPs after checking they're the same except for the last
  // index.
  GetElementPtrInst *GEPA = getSourceGEP(A);
  GetElementPtrInst *GEPB = getSourceGEP(B);
  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
    return false;

  unsigned FinalIndex = GEPA->getNumOperands() - 1;
  for (unsigned i = 0; i < FinalIndex; i++)
    if (GEPA->getOperand(i) != GEPB->getOperand(i))
      return false;

  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpA || !OpB || OpA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding 1 to OpA won't overflow.
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
  // we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If any bits are known to be zero other than the sign bit in OpA, we can
  // add 1 to it while guaranteeing no overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
      Safe = true;
  }

  if (!Safe)
    return false;

  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
  return X2 == OffsetSCEVB;
}
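
// Illustrative note: the GEP fallback above covers accesses whose GEPs differ
// only in their final index, e.g. p[sext(i)] vs. p[sext(i + 1)] where the add
// is nsw; SCEV alone often cannot prove these consecutive because it cannot
// fold the "+ 1" through the sign/zero extension.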

void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}
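
// Moving each instruction to just before I preserves the moved instructions'
// original relative order, so definitions still precede their uses among the
// instructions that are moved.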

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1)
      FirstInstr = I.getIterator();
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getPointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
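
// Worked example: a chain of five i16 stores is 10 bytes, so
// NumLeft = (10 - 10 % 4) / 2 = 4 and the chain splits into a 4-element
// (8-byte) piece and a 1-element remainder.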

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);

  for (Instruction *I : Chain) {
    if (IsLoadChain)
      assert(isa<LoadInst>(I) &&
             "All elements of Chain must be loads, or all must be stores.");
    else
      assert(isa<StoreInst>(I) &&
             "All elements of Chain must be loads, or all must be stores.");
  }

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                   << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
        break;

      if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
        continue;

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
          OBB.dominates(ChainInstr, MemInstr))
        continue;

      // Same case, but in reverse.
      if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
          OBB.dominates(MemInstr, ChainInstr))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getPointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getPointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }

    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the BB into the vectorized load.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2)
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      LoadRefs[ObjPtr].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
      if (TySize > VecRegSize / 2)
        continue;

      if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      StoreRefs[ObjPtr].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<Value *, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the stores in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}
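
// Note: the chunk size of 64 used above matches the fixed-size
// ConsecutiveChain[64] array in vectorizeInstructions below.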

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given stores and find all of the pairs
  // of stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;
724 SmallVector
<Instruction
*, 16> Operands
;
726 while (I
!= -1 && (is_contained(Tails
, I
) || is_contained(Heads
, I
))) {
727 if (InstructionsProcessed
.count(Instrs
[I
]))
730 Operands
.push_back(Instrs
[I
]);
731 I
= ConsecutiveChain
[I
];
734 bool Vectorized
= false;
735 if (isa
<LoadInst
>(*Operands
.begin()))
736 Vectorized
= vectorizeLoadChain(Operands
, &InstructionsProcessed
);
738 Vectorized
= vectorizeStoreChain(Operands
, &InstructionsProcessed
);
740 Changed
|= Vectorized
;

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
<Instruction
*> NewChain
= getVectorizablePrefix(Chain
);
778 if (NewChain
.empty()) {
779 // No vectorization possible.
780 InstructionsProcessed
->insert(Chain
.begin(), Chain
.end());
783 if (NewChain
.size() == 1) {
784 // Failed after the first instruction. Discard it and try the smaller chain.
785 InstructionsProcessed
->insert(NewChain
.front());
789 // Update Chain to the valid vectorizable subchain.
791 ChainSize
= Chain
.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                    " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != 0)
      return false;

    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign < StackAdjustedAlignment)
      return false;

    Alignment = NewAlign;
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Vec = Builder.CreateInsertElement(Vec, Extract,
                                          Builder.getInt32(NewIdx));
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Vec = Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
    }
  }

  // This cast is safe because Builder.CreateStore() always creates a bona fide
  // StoreInst.
  StoreInst *SI = cast<StoreInst>(
      Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
                                                     VecTy->getPointerTo(AS))));
  propagateMetadata(SI, Chain);
  SI->setAlignment(Alignment);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }
= DL
.getTypeSizeInBits(LoadTy
);
914 unsigned AS
= L0
->getPointerAddressSpace();
915 unsigned VecRegSize
= TTI
.getLoadStoreVecRegBitWidth(AS
);
916 unsigned VF
= VecRegSize
/ Sz
;
917 unsigned ChainSize
= Chain
.size();
918 unsigned Alignment
= getAlignment(L0
);
920 if (!isPowerOf2_32(Sz
) || VF
< 2 || ChainSize
< 2) {
921 InstructionsProcessed
->insert(Chain
.begin(), Chain
.end());

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                    " Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != 0)
      return false;

    unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, L0, nullptr, &DT);
    if (NewAlign < StackAdjustedAlignment)
      return false;

    Alignment = NewAlign;
  }

  DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));

  // This cast is safe because Builder.CreateLoad always creates a bona fide
  // LoadInst.
  LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
  propagateMetadata(LI, Chain);
  LI->setAlignment(Alignment);

  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the value being loaded is a
    // constant. In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
  if (Alignment % SzInBytes == 0)
    return false;

  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
               << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}