//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
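//
// As a simplified illustration (not a verbatim test case), two adjacent
// 4-byte loads such as
//
//   %l0 = load i32, i32* %p,  align 8
//   %l1 = load i32, i32* %p1, align 4   ; where %p1 == %p + 4 bytes
//
// are rewritten into one wide load followed by cheap extracts:
//
//   %cast = bitcast i32* %p to <2 x i32>*
//   %wide = load <2 x i32>, <2 x i32>* %cast, align 8
//   %l0   = extractelement <2 x i32> %wide, i32 0
//   %l1   = extractelement <2 x i32> %wide, i32 1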
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {
/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together and reducing the number of instructions the main search operates on
/// at a time, i.e. this is to reduce compile time and nothing else as the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be assumed anything more than just a pointer to some opaque thing.
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;
class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  unsigned getPointerAddressSpace(Value *I);

  static const unsigned MaxDepth = 3;

  bool isConsecutiveAccess(Value *A, Value *B);
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks whether this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          Align Alignment);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};
} // end anonymous namespace
char LoadStoreVectorizerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)
Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}
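// Note: a quick way to exercise just this pass on a test file (assuming the
// pass names it is registered under in your LLVM build) is:
//
//   opt -passes=load-store-vectorizer -S in.ll        # new pass manager
//   opt -load-store-vectorizer -S in.ll               # legacy pass manager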
bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  return V.run();
}
PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  bool Changed = V.run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}
// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}
// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}
// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  Type *PtrATy = getLoadStoreType(A);
  Type *PtrBTy = getLoadStoreType(B);
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}
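// For example (illustrative only): two i32 loads, one from %p and one from
// "getelementptr inbounds i32, i32* %p, i64 1", are consecutive. Size is the
// 4-byte store size of i32, and areConsecutivePointers must prove that the
// second pointer is exactly 4 bytes past the first.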
bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
                                        APInt PtrDelta, unsigned Depth) const {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0);
  APInt OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return false;

  // In case we have to shrink the pointer,
  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
  // possible overflow and the value should fit into the smallest data type
  // used in the cast/gep chain.
  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
         OffsetB.getMinSignedBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == PtrDelta;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the pointer delta requested.
  APInt BaseDelta = PtrDelta - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // The above check will not catch the cases where one of the pointers is
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
  // (AS + BS). Get the minus scev. That will allow re-combining the expressions
  // and getting the simplified difference.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  if (C == Dist)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  // If both AddOpA and AddOpB are adds with NSW/NUW and one of their operands
  // is the same, we can guarantee that the transformation is safe if we can
  // prove that AddOpA won't overflow when IdxDiff is added to the other
  // operand of AddOpA.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  // Both %tmp7 and %tmp12 have the nsw flag and their first operand
  // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
  // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
  // nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}
bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(OpA, OpB);
  }
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction.
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is
    // an operand which is the same in those two instructions.
    // Below we account for possible orders of the operands of
    // these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
  // overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}
bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
                                    const APInt &PtrDelta,
                                    unsigned Depth) const {
  if (Depth++ == MaxDepth)
    return false;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      return SelectA->getCondition() == SelectB->getCondition() &&
             areConsecutivePointers(SelectA->getTrueValue(),
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
             areConsecutivePointers(SelectA->getFalseValue(),
                                    SelectB->getFalseValue(), PtrDelta, Depth);
    }
  }
  return false;
}
void Vectorizer::reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}
std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}
void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getLoadStorePointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}
std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
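// Worked example (illustrative): a chain of three i16 accesses is 6 bytes, so
// NumLeft = (6 - 6 % 4) / 2 = 2, and the chain is split into a 4-byte piece of
// two elements plus a trailing 2-byte piece of one element.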
ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::sideeffect) {
      // Ignore llvm.sideeffect calls.
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::pseudoprobe) {
      // Ignore llvm.pseudoprobe calls.
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() == Intrinsic::assume) {
      // Ignore llvm.assume calls.
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
                        << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                        << '\n');
      break;
    }
  }

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if we have a load/store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && ChainLoad &&
          (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
        continue;

      // Same case, but in reverse.
      if (MemLoad && isa<StoreInst>(ChainInstr) &&
          (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(BarrierMemoryInstr->comesBefore(ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}
static ChainID getChainID(const Value *Ptr) {
  const Value *ObjPtr = getUnderlyingObject(Ptr);
  if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
    // The selects themselves are distinct instructions even if they share the
    // same condition and evaluate to consecutive pointers for the true and
    // false values of the condition. Therefore using the selects themselves
    // for grouping instructions would put consecutive accesses into different
    // lists; they would never even be checked for being consecutive, and would
    // not be vectorized.
    return Sel->getCondition();
  }
  return ObjPtr;
}
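// For example, two loads whose pointers are GEPs off the same base pointer
// %base both map to the ChainID %base and therefore land in the same
// InstrList, while two selects over the same pair of pointers share the
// ChainID of their common condition.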
std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      const ChainID ID = getChainID(Ptr);
      LoadRefs[ID].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and do not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      const ChainID ID = getChainID(Ptr);
      StoreRefs[ID].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<ChainID, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the stores in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
                    << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}
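// Illustrative walk-through: for three consecutive i32 loads A, B, C at byte
// offsets 0, 4 and 8, the quadratic search records ConsecutiveChain[A] = B and
// ConsecutiveChain[B] = C, with A and B in Heads and B and C in Tails. Head A
// is followed first and collects {A, B, C}; head B is then skipped because its
// instructions were already consumed by the chain starting at A.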
bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy = nullptr;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }
  assert(StoreTy && "Failed to find store type");

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  Align Alignment = S0->getAlign();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  FixedVectorType *VecTy;
  auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = FixedVectorType::get(StoreTy->getScalarType(),
                                 Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = FixedVectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
             vectorizeStoreChain(Chains.second, InstructionsProcessed);
    }

    Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                Align(StackAdjustedAlignment),
                                                DL, S0, nullptr, &DT);
    if (NewAlign >= Alignment)
      Alignment = NewAlign;
    else
      return false;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
      Vec,
      Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
      Alignment);
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}
bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy = nullptr;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }
  assert(LoadTy && "Can't determine LoadInst type from chain");

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  Align Alignment = L0->getAlign();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  VectorType *VecTy;
  auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = FixedVectorType::get(LoadTy->getScalarType(),
                                 Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = FixedVectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
             vectorizeLoadChain(Chains.second, InstructionsProcessed);
    }

    Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                Align(StackAdjustedAlignment),
                                                DL, L0, nullptr, &DT);
    if (NewAlign >= Alignment)
      Alignment = NewAlign;
    else
      return false;
  }

  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  LoadInst *LI =
      Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
  propagateMetadata(LI, Chain);

  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the value being loaded is a
    // constant. In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}
bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    Align Alignment) {
  if (Alignment.value() % SzInBytes == 0)
    return false;

  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
                    << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}
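// For example, a 16-byte access (<4 x i32>) that is only 4-byte aligned fails
// the "Alignment % SzInBytes == 0" fast path (4 % 16 != 0), so whether it is
// treated as misaligned comes down to the target's
// allowsMisalignedMemoryAccesses answer and its "fast" flag.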