//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"

using namespace llvm;
using namespace llvm::PatternMatch;
STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
                bool TryEarlyFoldsOnly)
      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
        TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}

  const TargetTransformInfo &TTI;
  const DominatorTree &DT;

  /// If true, only perform beneficial early IR transforms. Do not introduce new
  /// vector operations.
  bool TryEarlyFoldsOnly;

  InstructionWorklist Worklist;

  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
  // parameter. That should be updated to specific sub-classes because the
  // run loop was changed to dispatch on opcode.
  bool vectorizeLoadInsert(Instruction &I);
  bool widenSubvectorLoad(Instruction &I);
  ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                        ExtractElementInst *Ext1,
                                        unsigned PreferredExtractIndex) const;
  bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                             const Instruction &I,
                             ExtractElementInst *&ConvertToShuffle,
                             unsigned PreferredExtractIndex);
  void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                     Instruction &I);
  void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                       Instruction &I);
  bool foldExtractExtract(Instruction &I);
  bool foldInsExtFNeg(Instruction &I);
  bool foldBitcastShuffle(Instruction &I);
  bool scalarizeBinopOrCmp(Instruction &I);
  bool scalarizeVPIntrinsic(Instruction &I);
  bool foldExtractedCmps(Instruction &I);
  bool foldSingleElementStore(Instruction &I);
  bool scalarizeLoadExtract(Instruction &I);
  bool foldShuffleOfBinops(Instruction &I);
  bool foldShuffleFromReductions(Instruction &I);
  bool foldSelectShuffle(Instruction &I, bool FromReduction = false);

  void replaceValue(Value &Old, Value &New) {
    Old.replaceAllUsesWith(&New);
    if (auto *NewI = dyn_cast<Instruction>(&New)) {
      Worklist.pushUsersToWorkList(*NewI);
      Worklist.pushValue(NewI);
    }
    Worklist.pushValue(&Old);
  }

  void eraseInstruction(Instruction &I) {
    for (Value *Op : I.operands())
      Worklist.pushValue(Op);
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
  // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
  // The widened load may load data from dirty regions or create data races
  // non-existent in the source.
  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;

  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
  // sure we have all of our type-based constraints in place for this target.
  Type *ScalarTy = Load->getType()->getScalarType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
  // Match insert into fixed vector of scalar value.
  // TODO: Handle non-zero insert index.
  Value *Scalar;
  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
      !Scalar->hasOneUse())
    return false;

  // Optionally match an extract from another vector.
  Value *X;
  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
  auto *Load = dyn_cast<LoadInst>(X);
  if (!canWidenLoad(Load, TTI))
    return false;

  Type *ScalarTy = Scalar->getType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();

  // Check safety of replacing the scalar load with a larger vector load.
  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  const DataLayout &DL = I.getModule()->getDataLayout();
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");

  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
  unsigned OffsetEltIndex = 0;
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
                                   &DT)) {
    // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it safe to load from a base address with
    // updated alignment. If it is, we can shuffle the element(s) into place
    // after loading.
    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);

    // We want to shuffle the result down from a high element of a vector, so
    // the offset must be positive.
    if (Offset.isNegative())
      return false;

    // The offset must be a multiple of the scalar element to shuffle cleanly
    // in the element's size.
    uint64_t ScalarSizeInBytes = ScalarSize / 8;
    if (Offset.urem(ScalarSizeInBytes) != 0)
      return false;

    // If we load MinVecNumElts, will our target element still be loaded?
    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
    if (OffsetEltIndex >= MinVecNumElts)
      return false;

    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
                                     &DT))
      return false;
  }

  // Update alignment with offset value. Note that the offset could be negated
  // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
  // negation does not change the result of the alignment calculation.
  Alignment = commonAlignment(Alignment, Offset.getZExtValue());

  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  OldCost +=
      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
                                   /* Insert */ true, HasExtract, CostKind);

  // New pattern: load VecPtr
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
  // Optionally, we are shuffling the loaded vector element(s) into place.
  // For the mask set everything but element 0 to undef to prevent poison from
  // propagating from the extra loaded memory. This will also optionally
  // shrink/grow the vector from the loaded size to the output size.
  // We assume this operation has no cost in codegen if there was no offset.
  // Note that we could use freeze to avoid poison problems, but then we might
  // still need a shuffle to change the vector size.
  auto *Ty = cast<FixedVectorType>(I.getType());
  unsigned OutputNumElts = Ty->getNumElements();
  SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
  Mask[0] = OffsetEltIndex;
  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // It is safe and potentially profitable to load a vector directly:
  // inselt undef, load Scalar, 0 --> load VecPtr
  IRBuilder<> Builder(Load);
  Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
      SrcPtr, MinVecTy->getPointerTo(AS));
  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
  VecLd = Builder.CreateShuffleVector(VecLd, Mask);

  replaceValue(I, *VecLd);
  return true;
}
/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
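/// For illustration, a hypothetical example of the rewrite:
///   %v = load <2 x float>, ptr %p
///   %w = shufflevector <2 x float> %v, <2 x float> poison,
///                      <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// -->
///   %w = load <4 x float>, ptr %p
/// (only when the wider load is known dereferenceable and not more expensive).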
bool VectorCombine::widenSubvectorLoad(Instruction &I) {
  // Match subvector insert of fixed vector.
  auto *Shuf = cast<ShuffleVectorInst>(&I);
  if (!Shuf->isIdentityWithPadding())
    return false;

  // Allow a non-canonical shuffle mask that is choosing elements from op1.
  unsigned NumOpElts =
      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
    return M >= (int)(NumOpElts);
  });

  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
  if (!canWidenLoad(Load, TTI))
    return false;

  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  auto *Ty = cast<FixedVectorType>(I.getType());
  const DataLayout &DL = I.getModule()->getDataLayout();
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
    return false;

  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();

  // Original pattern: insert_subvector (load PtrOp)
  // This conservatively assumes that the cost of a subvector insert into an
  // undef value is 0. We could add that cost if the cost model accurately
  // reflects the real cost of that operation.
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);

  // New pattern: load PtrOp
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Ty->getPointerTo(AS));
  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
  replaceValue(I, *VecLd);
  return true;
}
/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const {
  auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
  auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
  assert(Index0C && Index1C && "Expected constant extract indexes");

  unsigned Index0 = Index0C->getZExtValue();
  unsigned Index1 = Index1C->getZExtValue();

  // If the extract indexes are identical, no shuffle is needed.
  if (Index0 == Index1)
    return nullptr;

  Type *VecTy = Ext0->getVectorOperand()->getType();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
  InstructionCost Cost0 =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  InstructionCost Cost1 =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);

  // If both costs are invalid no shuffle is needed
  if (!Cost0.isValid() && !Cost1.isValid())
    return nullptr;

  // We are extracting from 2 different indexes, so one operand must be shuffled
  // before performing a vector operation and/or extract. The more expensive
  // extract will be replaced by a shuffle.

  // If the costs are equal and there is a preferred extract index, shuffle the
  // other extract.
  if (PreferredExtractIndex == Index0)
    return Ext1;
  if (PreferredExtractIndex == Index1)
    return Ext0;

  // Otherwise, replace the extract with the higher index.
  return Index0 > Index1 ? Ext0 : Ext1;
}
/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          const Instruction &I,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) {
  auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getOperand(1));
  auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getOperand(1));
  assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");

  unsigned Opcode = I.getOpcode();
  Type *ScalarTy = Ext0->getType();
  auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
  InstructionCost ScalarOpCost, VectorOpCost;

  // Get cost estimates for scalar and vector versions of the operation.
  bool IsBinOp = Instruction::isBinaryOp(Opcode);
  if (IsBinOp) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  } else {
    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
           "Expected a compare");
    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  }

  // Get cost estimates for the extract elements. These costs will factor into
  // the overall cost comparison below.
  unsigned Ext0Index = Ext0IndexC->getZExtValue();
  unsigned Ext1Index = Ext1IndexC->getZExtValue();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  InstructionCost Extract0Cost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
  InstructionCost Extract1Cost =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);

  // A more expensive extract will always be replaced by a splat shuffle.
  // For example, if Ext0 is more expensive:
  //    opcode (extelt V0, Ext0), (ext V1, Ext1) -->
  //    extelt (opcode (splat V0, Ext0), V1), Ext1
  // TODO: Evaluate whether that always results in lowest cost. Alternatively,
  //       check the cost of creating a broadcast shuffle and shuffling both
  //       operands to element 0.
  InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

  // Extra uses of the extracts mean that we include those costs in the
  // vector total because those instructions will not be eliminated.
  InstructionCost OldCost, NewCost;
  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
    // Handle a special case. If the 2 extracts are identical, adjust the
    // formulas to account for that. The extra use charge allows for either the
    // CSE'd pattern or an unoptimized form with identical values:
    // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
    bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
                                  : !Ext0->hasOneUse() || !Ext1->hasOneUse();
    OldCost = CheapExtractCost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
  } else {
    // Handle the general case. Each extract is actually a different value:
    // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
    OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost +
              !Ext0->hasOneUse() * Extract0Cost +
              !Ext1->hasOneUse() * Extract1Cost;
  }

  ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
  if (ConvertToShuffle) {
    if (IsBinOp && DisableBinopExtractShuffle)
      return true;

    // If we are extracting from 2 different indexes, then one operand must be
    // shuffled before performing the vector operation. The shuffle mask is
    // poison except for 1 lane that is being translated to the remaining
    // extraction lane. Therefore, it is a splat shuffle. Ex:
    // ShufMask = { poison, poison, 0, poison }
    // TODO: The cost model has an option for a "broadcast" shuffle
    //       (splat-from-element-0), but no option for a more general splat.
    NewCost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  }

  // Aggressively form a vector op if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  return OldCost < NewCost;
}
/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilder<> &Builder) {
  // The shuffle mask is poison except for 1 lane that is being translated
  // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
  // ShufMask = { 2, poison, poison, poison }
  auto *VecTy = cast<FixedVectorType>(Vec->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
  ShufMask[NewIndex] = OldIndex;
  return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not creating
/// unnecessary instructions.
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
                                            unsigned NewIndex,
                                            IRBuilder<> &Builder) {
  // Shufflevectors can only be created for fixed-width vectors.
  if (!isa<FixedVectorType>(ExtElt->getOperand(0)->getType()))
    return nullptr;

  // If the extract can be constant-folded, this code is unsimplified. Defer
  // to other passes to handle that.
  Value *X = ExtElt->getVectorOperand();
  Value *C = ExtElt->getIndexOperand();
  assert(isa<ConstantInt>(C) && "Expected a constant index operand");
  if (isa<Constant>(X))
    return nullptr;

  Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
                                   NewIndex, Builder);
  return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
}

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
                                  ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<CmpInst>(&I) && "Expected a compare");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
  CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
  Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
                                    ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecBO =
      Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);

  // All IR flags are safe to back-propagate because any potential poison
  // created in unused vector elements is discarded by the extract.
  if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
    VecBOInst->copyIRFlags(&I);

  Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}
/// Match an instruction with extracted vector operands.
bool VectorCombine::foldExtractExtract(Instruction &I) {
  // It is not safe to transform things like div, urem, etc. because we may
  // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;

  Instruction *I0, *I1;
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;

  Value *V0, *V1;
  uint64_t C0, C1;
  if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
      !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
      V0->getType() != V1->getType())
    return false;

  // If the scalar value 'I' is going to be re-inserted into a vector, then try
  // to create an extract to that same element. The extract/insert can be
  // reduced to a "select shuffle".
  // TODO: If we add a larger pattern match that starts from an insert, this
  // probably becomes unnecessary.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  uint64_t InsertIndex = InvalidIndex;
  if (I.hasOneUse())
    match(I.user_back(),
          m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));

  ExtractElementInst *ExtractToChange;
  if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
    return false;

  if (ExtractToChange) {
    unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
    ExtractElementInst *NewExtract =
        translateExtract(ExtractToChange, CheapExtractIdx, Builder);
    if (!NewExtract)
      return false;
    if (ExtractToChange == Ext0)
      Ext0 = NewExtract;
    else
      Ext1 = NewExtract;
  }

  if (Pred != CmpInst::BAD_ICMP_PREDICATE)
    foldExtExtCmp(Ext0, Ext1, I);
  else
    foldExtExtBinop(Ext0, Ext1, I);

  return true;
}
/// Try to replace an extract + scalar fneg + insert with a vector fneg +
/// shuffle.
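/// For illustration, a hypothetical example of the rewrite:
///   %e = extractelement <4 x float> %src, i32 2
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i32 2
/// -->
///   %vn = fneg <4 x float> %src
///   %r  = shufflevector <4 x float> %dst, <4 x float> %vn,
///                       <4 x i32> <i32 0, i32 1, i32 6, i32 3>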
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
  // Match an insert (op (extract)) pattern.
  Value *DestVec;
  uint64_t Index;
  Instruction *FNeg;
  if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
                             m_ConstantInt(Index))))
    return false;

  // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
  Value *SrcVec;
  Instruction *Extract;
  if (!match(FNeg, m_FNeg(m_CombineAnd(
                       m_Instruction(Extract),
                       m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
    return false;

  // TODO: We could handle this with a length-changing shuffle.
  auto *VecTy = cast<FixedVectorType>(I.getType());
  if (SrcVec->getType() != VecTy)
    return false;

  // Ignore bogus insert/extract index.
  unsigned NumElts = VecTy->getNumElements();
  if (Index >= NumElts)
    return false;

  // We are inserting the negated element into the same lane that we extracted
  // from. This is equivalent to a select-shuffle that chooses all but the
  // negated element from the destination vector.
  SmallVector<int> Mask(NumElts);
  std::iota(Mask.begin(), Mask.end(), 0);
  Mask[Index] = Index + NumElts;

  Type *ScalarTy = VecTy->getScalarType();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
      TTI.getVectorInstrCost(I, VecTy, CostKind, Index);

  // If the extract has one use, it will be eliminated, so count it in the
  // original cost. If it has more than one use, ignore the cost because it will
  // be the same before/after.
  if (Extract->hasOneUse())
    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);

  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);

  if (NewCost > OldCost)
    return false;

  // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
  // shuffle DestVec, (fneg SrcVec), Mask
  Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
  Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
  replaceValue(I, *Shuf);
  return true;
}
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
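/// For illustration, a hypothetical example of the rewrite:
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %b = bitcast <4 x i32> %s to <8 x i16>
/// -->
///   %b = bitcast <4 x i32> %v to <8 x i16>
///   %s = shufflevector <8 x i16> %b, <8 x i16> poison,
///                      <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>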
bool VectorCombine::foldBitcastShuffle(Instruction &I) {
  Value *V;
  ArrayRef<int> Mask;
  if (!match(&I, m_BitCast(
                     m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
    return false;

  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
  // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
  // mask for scalable type is a splat or not.
  // 2) Disallow non-vector casts.
  // TODO: We could allow any shuffle.
  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
  auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
  if (!DestTy || !SrcTy)
    return false;

  unsigned DestEltSize = DestTy->getScalarSizeInBits();
  unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
  if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
    return false;

  SmallVector<int, 16> NewMask;
  if (DestEltSize <= SrcEltSize) {
    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
    // always be expanded to the equivalent form choosing narrower elements.
    assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = SrcEltSize / DestEltSize;
    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
  } else {
    // The bitcast is from narrow elements to wide elements. The shuffle mask
    // must choose consecutive elements to allow casting first.
    assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = DestEltSize / SrcEltSize;
    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
      return false;
  }

  // Bitcast the shuffle src - keep its original width but using the destination
  // scalar type.
  unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
  auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);

  // The new shuffle must not cost more than the old shuffle. The bitcast is
  // moved ahead of the shuffle, so assume that it has the same cost as before.
  InstructionCost DestCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask);
  InstructionCost SrcCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
  if (DestCost > SrcCost || !DestCost.isValid())
    return false;

  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
  Value *CastV = Builder.CreateBitCast(V, ShuffleTy);
  Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
  replaceValue(I, *Shuf);
  return true;
}
/// VP Intrinsics whose vector operands are both splat values may be simplified
/// into the scalar version of the operation and the result splatted. This
/// can lead to scalarization down the line.
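/// For illustration, a hypothetical example (%xsplat and %ysplat are splats of
/// scalars %x and %y, the mask is all-true):
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xsplat, <4 x i32> %ysplat,
///                                          <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
///                                          i32 %evl)
/// -->
///   %s = add i32 %x, %y
///   %r = splat of %s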
bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
  if (!isa<VPIntrinsic>(I))
    return false;
  VPIntrinsic &VPI = cast<VPIntrinsic>(I);
  Value *Op0 = VPI.getArgOperand(0);
  Value *Op1 = VPI.getArgOperand(1);

  if (!isSplatValue(Op0) || !isSplatValue(Op1))
    return false;

  // For the binary VP intrinsics supported here, the result on disabled lanes
  // is a poison value. For now, only do this simplification if all lanes
  // are active.
  // TODO: Relax the condition that all lanes are active by using insertelement
  // on inactive lanes.
  auto IsAllTrueMask = [](Value *MaskVal) {
    if (Value *SplattedVal = getSplatValue(MaskVal))
      if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
        return ConstValue->isAllOnesValue();
    return false;
  };
  if (!IsAllTrueMask(VPI.getArgOperand(2)))
    return false;

  // Check to make sure we support scalarization of the intrinsic
  Intrinsic::ID IntrID = VPI.getIntrinsicID();
  if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
    return false;

  // Calculate cost of splatting both operands into vectors and the vector
  // intrinsic.
  VectorType *VecTy = cast<VectorType>(VPI.getType());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost SplatCost =
      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);

  // Calculate the cost of the VP Intrinsic
  SmallVector<Type *, 4> Args;
  for (Value *V : VPI.args())
    Args.push_back(V->getType());
  IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
  InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  InstructionCost OldCost = 2 * SplatCost + VectorOpCost;

  // Determine scalar opcode
  std::optional<unsigned> FunctionalOpcode =
      VPI.getFunctionalOpcode();
  std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
  if (!FunctionalOpcode) {
    ScalarIntrID = VPI.getFunctionalIntrinsicID();
    if (!ScalarIntrID)
      return false;
  }

  // Calculate cost of scalarizing
  InstructionCost ScalarOpCost = 0;
  if (ScalarIntrID) {
    IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
    ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  } else {
    ScalarOpCost =
        TTI.getArithmeticInstrCost(*FunctionalOpcode, VecTy->getScalarType());
  }

  // The existing splats may be kept around if other instructions use them.
  InstructionCost CostToKeepSplats =
      (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
  InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;

  LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
                    << "\n");
  LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
                    << ", Cost of scalarizing:" << NewCost << "\n");

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Scalarize the intrinsic
  ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
  Value *EVL = VPI.getArgOperand(3);
  const DataLayout &DL = VPI.getModule()->getDataLayout();

  // If the VP op might introduce UB or poison, we can scalarize it provided
  // that we know the EVL > 0: If the EVL is zero, then the original VP op
  // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
  // scalarizing it.
  bool SafeToSpeculate;
  if (ScalarIntrID)
    SafeToSpeculate = Intrinsic::getAttributes(I.getContext(), *ScalarIntrID)
                          .hasFnAttr(Attribute::AttrKind::Speculatable);
  else
    SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
        *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
  if (!SafeToSpeculate && !isKnownNonZero(EVL, DL, 0, &AC, &VPI, &DT))
    return false;

  Value *ScalarOp0 = getSplatValue(Op0);
  Value *ScalarOp1 = getSplatValue(Op1);
  Value *ScalarVal =
      ScalarIntrID
          ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
                                    {ScalarOp0, ScalarOp1})
          : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
                                ScalarOp0, ScalarOp1);

  replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
  return true;
}
/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
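/// For illustration, a hypothetical example of the rewrite:
///   %i = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i32 0
///   %r = add <4 x i32> %i, <i32 10, i32 20, i32 30, i32 40>
/// -->
///   %s = add i32 %x, 10
///   %r = insertelement <4 x i32> <i32 11, i32 22, i32 33, i32 44>, i32 %s, i32 0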
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  Value *Ins0, *Ins1;
  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
    return false;

  // Do not convert the vector condition of a vector select into a scalar
  // condition. That may cause problems for codegen because of differences in
  // boolean formats and register-file transfers.
  // TODO: Can we account for that in the cost model?
  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
  if (IsCmp)
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;

  // Match against one or both scalar values being inserted into constant
  // vectors:
  // vec_op VecC0, (inselt VecC1, V1, Index)
  // vec_op (inselt VecC0, V0, Index), VecC1
  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
  // TODO: Deal with mismatched index constants and variable indexes?
  Constant *VecC0 = nullptr, *VecC1 = nullptr;
  Value *V0 = nullptr, *V1 = nullptr;
  uint64_t Index0 = 0, Index1 = 0;
  if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
                               m_ConstantInt(Index0))) &&
      !match(Ins0, m_Constant(VecC0)))
    return false;
  if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
                               m_ConstantInt(Index1))) &&
      !match(Ins1, m_Constant(VecC1)))
    return false;

  bool IsConst0 = !V0;
  bool IsConst1 = !V1;
  if (IsConst0 && IsConst1)
    return false;
  if (!IsConst0 && !IsConst1 && Index0 != Index1)
    return false;

  // Bail for single insertion if it is a load.
  // TODO: Handle this once getVectorInstrCost can cost for load/stores.
  auto *I0 = dyn_cast_or_null<Instruction>(V0);
  auto *I1 = dyn_cast_or_null<Instruction>(V1);
  if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
      (IsConst1 && I0 && I0->mayReadFromMemory()))
    return false;

  uint64_t Index = IsConst0 ? Index1 : Index0;
  Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
  Type *VecTy = I.getType();
  assert(VecTy->isVectorTy() &&
         (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
          ScalarTy->isPointerTy()) &&
         "Unexpected types for insert element into binop or cmp");

  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (IsCmp) {
    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  }

  // Get cost estimate for the insert element. This cost will factor into
  // both sequences.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = TTI.getVectorInstrCost(
      Instruction::InsertElement, VecTy, CostKind, Index);
  InstructionCost OldCost =
      (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
  InstructionCost NewCost = ScalarOpCost + InsertCost +
                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
  // inselt NewVecC, (scalar_op V0, V1), Index

  // For constant cases, extract the scalar element, this should constant fold.
  if (IsConst0)
    V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
  if (IsConst1)
    V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));

  Value *Scalar =
      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);

  Scalar->setName(I.getName() + ".scalar");

  // All IR flags are safe to back-propagate. There is no potential for extra
  // poison to be created by the scalar instruction.
  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
    ScalarInst->copyIRFlags(&I);

  // Fold the vector constants in the original vectors into a new base vector.
  Value *NewVecC =
      IsCmp ? Builder.CreateCmp(Pred, VecC0, VecC1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, VecC0, VecC1);
  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
  replaceValue(I, *Insert);
  return true;
}
/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
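/// For illustration, a hypothetical example of the matched pattern:
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %x, i32 3
///   %c0 = icmp sgt i32 %e0, 10
///   %c1 = icmp sgt i32 %e1, 20
///   %r  = and i1 %c0, %c1
/// which can become a single vector compare against <i32 10, poison, poison,
/// i32 20>, a shuffle, a vector 'and', and one extract of lane 0.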
bool VectorCombine::foldExtractedCmps(Instruction &I) {
  // We are looking for a scalar binop of booleans.
  // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
  if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
    return false;

  // The compare predicates should match, and each compare should have a
  // single use.
  // TODO: Relax the one-use constraints.
  Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
  Instruction *I0, *I1;
  Constant *C0, *C1;
  CmpInst::Predicate P0, P1;
  if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
      !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
      P0 != P1)
    return false;

  // The compare operands must be extracts of the same vector with constant
  // extract indexes.
  // TODO: Relax the one-use constraints.
  Value *X;
  uint64_t Index0, Index1;
  if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
      !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
    return false;

  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
  if (!ConvertToShuf)
    return false;

  // The original scalar pattern is:
  // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
  CmpInst::Predicate Pred = P0;
  unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
                                                    : Instruction::ICmp;
  auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
  if (!VecTy)
    return false;

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost OldCost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
  OldCost +=
      TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                             CmpInst::makeCmpResultType(I0->getType()), Pred) *
      2;
  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());

  // The proposed vector pattern is:
  // vcmp = cmp Pred X, VecC
  // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
  InstructionCost NewCost = TTI.getCmpSelInstrCost(
      CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
  ShufMask[CheapIndex] = ExpensiveIndex;
  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                ShufMask);
  NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
  NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);

  // Aggressively form vector ops if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Create a vector constant from the 2 scalar constants.
  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
                                   PoisonValue::get(VecTy->getElementType()));
  CmpC[Index0] = C0;
  CmpC[Index1] = C1;
  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));

  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
                                        VCmp, Shuf);
  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
  replaceValue(I, *NewExt);
  return true;
}
// Check if memory loc modified between two instrs in the same BB
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                 BasicBlock::iterator End,
                                 const MemoryLocation &Loc, AAResults &AA) {
  unsigned NumScanned = 0;
  return std::any_of(Begin, End, [&](const Instruction &Instr) {
    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
           ++NumScanned > MaxInstrsToScan;
  });
}

/// Helper class to indicate whether a vector index can be safely scalarized and
/// if a freeze needs to be inserted.
class ScalarizationResult {
  enum class StatusTy { Unsafe, Safe, SafeWithFreeze };

  StatusTy Status;
  Value *ToFreeze;

  ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
      : Status(Status), ToFreeze(ToFreeze) {}

public:
  ScalarizationResult(const ScalarizationResult &Other) = default;
  ~ScalarizationResult() {
    assert(!ToFreeze && "freeze() not called with ToFreeze being set");
  }

  static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
  static ScalarizationResult safe() { return {StatusTy::Safe}; }
  static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
    return {StatusTy::SafeWithFreeze, ToFreeze};
  }

  /// Returns true if the index can be scalarize without requiring a freeze.
  bool isSafe() const { return Status == StatusTy::Safe; }
  /// Returns true if the index cannot be scalarized.
  bool isUnsafe() const { return Status == StatusTy::Unsafe; }
  /// Returns true if the index can be scalarize, but requires inserting a
  /// freeze.
  bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }

  /// Reset the state of Unsafe and clear ToFreze if set.
  void discard() {
    ToFreeze = nullptr;
    Status = StatusTy::Unsafe;
  }

  /// Freeze the ToFreeze and update the use in \p User to use it.
  void freeze(IRBuilder<> &Builder, Instruction &UserI) {
    assert(isSafeWithFreeze() &&
           "should only be used when freezing is required");
    assert(is_contained(ToFreeze->users(), &UserI) &&
           "UserI must be a user of ToFreeze");
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(cast<Instruction>(&UserI));
    Value *Frozen =
        Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
    for (Use &U : make_early_inc_range((UserI.operands())))
      if (U.get() == ToFreeze)
        U.set(Frozen);

    ToFreeze = nullptr;
  }
};

/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
                                              Instruction *CtxI,
                                              AssumptionCache &AC,
                                              const DominatorTree &DT) {
  // We do checks for both fixed vector types and scalable vector types.
  // This is the number of elements of fixed vector types,
  // or the minimum number of elements of scalable vector types.
  uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();

  if (auto *C = dyn_cast<ConstantInt>(Idx)) {
    if (C->getValue().ult(NumElements))
      return ScalarizationResult::safe();
    return ScalarizationResult::unsafe();
  }

  unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
  APInt Zero(IntWidth, 0);
  APInt MaxElts(IntWidth, NumElements);
  ConstantRange ValidIndices(Zero, MaxElts);
  ConstantRange IdxRange(IntWidth, true);

  if (isGuaranteedNotToBePoison(Idx, &AC)) {
    if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
                                                   true, &AC, CtxI, &DT)))
      return ScalarizationResult::safe();
    return ScalarizationResult::unsafe();
  }

  // If the index may be poison, check if we can insert a freeze before the
  // range of the index is restricted.
  Value *IdxBase;
  ConstantInt *CI;
  if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
    IdxRange = IdxRange.binaryAnd(CI->getValue());
  } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
    IdxRange = IdxRange.urem(CI->getValue());
  }

  if (ValidIndices.contains(IdxRange))
    return ScalarizationResult::safeWithFreeze(IdxBase);
  return ScalarizationResult::unsafe();
}

/// The memory operation on a vector of \p ScalarType had alignment of
/// \p VectorAlignment. Compute the maximal, but conservatively correct,
/// alignment that will be valid for the memory operation on a single scalar
/// element of the same type with index \p Idx.
static Align computeAlignmentAfterScalarization(Align VectorAlignment,
                                                Type *ScalarType, Value *Idx,
                                                const DataLayout &DL) {
  if (auto *C = dyn_cast<ConstantInt>(Idx))
    return commonAlignment(VectorAlignment,
                           C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
  return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
}
// Combine patterns like:
//   %0 = load <4 x i32>, <4 x i32>* %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, <4 x i32>* %a
// to:
//   %0 = bitcast <4 x i32>* %a to i32*
//   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
//   store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {
  auto *SI = cast<StoreInst>(&I);
  if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
    return false;

  // TODO: Combine more complicated patterns (multiple insert) by referencing
  // TargetTransformInfo.
  Instruction *Source;
  Value *NewElement;
  Value *Idx;
  if (!match(SI->getValueOperand(),
             m_InsertElt(m_Instruction(Source), m_Value(NewElement),
                         m_Value(Idx))))
    return false;

  if (auto *Load = dyn_cast<LoadInst>(Source)) {
    auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
    const DataLayout &DL = I.getModule()->getDataLayout();
    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
    // Don't optimize for atomic/volatile load or store. Ensure memory is not
    // modified between, vector type matches store size, and index is inbounds.
    if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
        !DL.typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
        SrcAddr != SI->getPointerOperand()->stripPointerCasts())
      return false;

    auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
    if (ScalarizableIdx.isUnsafe() ||
        isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
                             MemoryLocation::get(SI), AA))
      return false;

    if (ScalarizableIdx.isSafeWithFreeze())
      ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
    Value *GEP = Builder.CreateInBoundsGEP(
        SI->getValueOperand()->getType(), SI->getPointerOperand(),
        {ConstantInt::get(Idx->getType(), 0), Idx});
    StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
    NSI->copyMetadata(*SI);
    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
        DL);
    NSI->setAlignment(ScalarOpAlignment);
    replaceValue(I, *NSI);
    eraseInstruction(I);
    return true;
  }

  return false;
}
/// Try to scalarize vector loads feeding extractelement instructions.
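/// For illustration, a hypothetical example of the rewrite:
///   %v  = load <4 x i32>, ptr %p
///   %e2 = extractelement <4 x i32> %v, i32 2
/// -->
///   %g2 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i32 2
///   %e2 = load i32, ptr %g2
/// (only when every user of the load is such an extract and the cost model
/// agrees).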
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
  Value *Ptr;
  if (!match(&I, m_Load(m_Value(Ptr))))
    return false;

  auto *VecTy = cast<VectorType>(I.getType());
  auto *LI = cast<LoadInst>(&I);
  const DataLayout &DL = I.getModule()->getDataLayout();
  if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(VecTy->getScalarType()))
    return false;

  InstructionCost OriginalCost =
      TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
                          LI->getPointerAddressSpace());
  InstructionCost ScalarizedCost = 0;

  Instruction *LastCheckedInst = LI;
  unsigned NumInstChecked = 0;
  DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
  auto FailureGuard = make_scope_exit([&]() {
    // If the transform is aborted, discard the ScalarizationResults.
    for (auto &Pair : NeedFreeze)
      Pair.second.discard();
  });

  // Check if all users of the load are extracts with no memory modifications
  // between the load and the extract. Compute the cost of both the original
  // code and the scalarized version.
  for (User *U : LI->users()) {
    auto *UI = dyn_cast<ExtractElementInst>(U);
    if (!UI || UI->getParent() != LI->getParent())
      return false;

    // Check if any instruction between the load and the extract may modify
    // memory.
    if (LastCheckedInst->comesBefore(UI)) {
      for (Instruction &I :
           make_range(std::next(LI->getIterator()), UI->getIterator())) {
        // Bail out if we reached the check limit or the instruction may write
        // to memory.
        if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
          return false;
        NumInstChecked++;
      }
      LastCheckedInst = UI;
    }

    auto ScalarIdx = canScalarizeAccess(VecTy, UI->getOperand(1), &I, AC, DT);
    if (ScalarIdx.isUnsafe())
      return false;
    if (ScalarIdx.isSafeWithFreeze()) {
      NeedFreeze.try_emplace(UI, ScalarIdx);
      ScalarIdx.discard();
    }

    auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    OriginalCost +=
        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                               Index ? Index->getZExtValue() : -1);
    ScalarizedCost +=
        TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
                            Align(1), LI->getPointerAddressSpace());
    ScalarizedCost += TTI.getAddressComputationCost(VecTy->getElementType());
  }

  if (ScalarizedCost >= OriginalCost)
    return false;

  // Replace extracts with narrow scalar loads.
  for (User *U : LI->users()) {
    auto *EI = cast<ExtractElementInst>(U);
    Value *Idx = EI->getOperand(1);

    // Insert 'freeze' for poison indexes.
    auto It = NeedFreeze.find(EI);
    if (It != NeedFreeze.end())
      It->second.freeze(Builder, *cast<Instruction>(Idx));

    Builder.SetInsertPoint(EI);
    Value *GEP =
        Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
    auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
        VecTy->getElementType(), GEP, EI->getName() + ".scalar"));

    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
        LI->getAlign(), VecTy->getElementType(), Idx, DL);
    NewLoad->setAlignment(ScalarOpAlignment);

    replaceValue(*EI, *NewLoad);
  }

  FailureGuard.release();
  return true;
}
/// Try to convert "shuffle (binop), (binop)" with a shared binop operand into
/// "binop (shuffle), (shuffle)".
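/// For illustration, a hypothetical example with a shared operand %x:
///   %a = add <4 x i32> %x, %y
///   %b = add <4 x i32> %x, %w
///   %s = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
/// -->
///   %sx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
///   %sy = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
///   %s  = add <4 x i32> %sx, %sy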
bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
  auto *VecTy = cast<FixedVectorType>(I.getType());
  BinaryOperator *B0, *B1;
  ArrayRef<int> Mask;
  if (!match(&I, m_Shuffle(m_OneUse(m_BinOp(B0)), m_OneUse(m_BinOp(B1)),
                           m_Mask(Mask))) ||
      B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy)
    return false;

  // Try to replace a binop with a shuffle if the shuffle is not costly.
  // The new shuffle will choose from a single, common operand, so it may be
  // cheaper than the existing two-operand shuffle.
  SmallVector<int> UnaryMask = createUnaryMask(Mask, Mask.size());
  Instruction::BinaryOps Opcode = B0->getOpcode();
  InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  InstructionCost ShufCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_PermuteSingleSrc, VecTy, UnaryMask);
  if (ShufCost > BinopCost)
    return false;

  // If we have something like "add X, Y" and "add Z, X", swap ops to match.
  Value *X = B0->getOperand(0), *Y = B0->getOperand(1);
  Value *Z = B1->getOperand(0), *W = B1->getOperand(1);
  if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W)
    std::swap(X, Y);

  Value *Shuf0, *Shuf1;
  if (X == Z) {
    // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W)
    Shuf0 = Builder.CreateShuffleVector(X, UnaryMask);
    Shuf1 = Builder.CreateShuffleVector(Y, W, Mask);
  } else if (Y == W) {
    // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y)
    Shuf0 = Builder.CreateShuffleVector(X, Z, Mask);
    Shuf1 = Builder.CreateShuffleVector(Y, UnaryMask);
  } else {
    return false;
  }

  Value *NewBO = Builder.CreateBinOp(Opcode, Shuf0, Shuf1);
  // Intersect flags from the old binops.
  if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
    NewInst->copyIRFlags(B0);
    NewInst->andIRFlags(B1);
  }

  replaceValue(I, *NewBO);
  return true;
}
/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
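/// For illustration, a hypothetical example:
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// Since an integer add reduction is insensitive to lane order, the shuffle
/// can be rewritten with a cheaper (ideally identity) mask when the cost model
/// says that is profitable.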
1407 bool VectorCombine::foldShuffleFromReductions(Instruction
&I
) {
1408 auto *II
= dyn_cast
<IntrinsicInst
>(&I
);
1411 switch (II
->getIntrinsicID()) {
1412 case Intrinsic::vector_reduce_add
:
1413 case Intrinsic::vector_reduce_mul
:
1414 case Intrinsic::vector_reduce_and
:
1415 case Intrinsic::vector_reduce_or
:
1416 case Intrinsic::vector_reduce_xor
:
1417 case Intrinsic::vector_reduce_smin
:
1418 case Intrinsic::vector_reduce_smax
:
1419 case Intrinsic::vector_reduce_umin
:
1420 case Intrinsic::vector_reduce_umax
:
1426 // Find all the inputs when looking through operations that do not alter the
1427 // lane order (binops, for example). Currently we look for a single shuffle,
1428 // and can ignore splat values.
  std::queue<Value *> Worklist;
  SmallPtrSet<Value *, 4> Visited;
  ShuffleVectorInst *Shuffle = nullptr;
  if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
    Worklist.push(Op);

  while (!Worklist.empty()) {
    Value *CV = Worklist.front();
    Worklist.pop();
    if (Visited.contains(CV))
      continue;

    // Splats don't change the order, so can be safely ignored.
    if (isSplatValue(CV))
      continue;

    Visited.insert(CV);

    if (auto *CI = dyn_cast<Instruction>(CV)) {
      if (CI->isBinaryOp()) {
        for (auto *Op : CI->operand_values())
          Worklist.push(Op);
        continue;
      } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
        if (Shuffle && Shuffle != SV)
          return false;
        Shuffle = SV;
        continue;
      }
    }

    // Anything else is currently an unknown node.
    return false;
  }

  if (!Shuffle)
    return false;

  // Check all uses of the binary ops and shuffles are also included in the
  // lane-invariant operations (Visited should be the list of lanewise
  // instructions, including the shuffle that we found).
  for (auto *V : Visited)
    for (auto *U : V->users())
      if (!Visited.contains(U) && U != &I)
        return false;

  FixedVectorType *VecType =
      dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
  if (!VecType)
    return false;
  FixedVectorType *ShuffleInputType =
      dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
  if (!ShuffleInputType)
    return false;
  unsigned NumInputElts = ShuffleInputType->getNumElements();

  // Find the mask from sorting the lanes into order. This is most likely to
  // become an identity or concat mask. Undef elements are pushed to the end.
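  // For instance (illustrative), a mask of <3, 1, -1, 0> sorts to
  // <0, 1, 3, -1>: ascending indices first, undef (-1) last, because the
  // comparison below is performed on the unsigned values.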
  SmallVector<int> ConcatMask;
  Shuffle->getShuffleMask(ConcatMask);
  sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
  // In the case of a truncating shuffle it's possible for the mask
  // to have an index greater than the size of the resulting vector.
  // This requires special handling.
  bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
  bool UsesSecondVec =
      any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });

  FixedVectorType *VecTyForCost =
      (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
  InstructionCost OldCost = TTI.getShuffleCost(
      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
      VecTyForCost, Shuffle->getShuffleMask());
  InstructionCost NewCost = TTI.getShuffleCost(
      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
      VecTyForCost, ConcatMask);

  LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
                    << "\n");
  LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost
                    << "\n");
  if (NewCost < OldCost) {
    Builder.SetInsertPoint(Shuffle);
    Value *NewShuffle = Builder.CreateShuffleVector(
        Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
    LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
    replaceValue(*Shuffle, *NewShuffle);
  }

  // See if we can re-use foldSelectShuffle, getting it to reduce the size of
  // the shuffle into a nicer order, as it can ignore the order of the shuffles.
  return foldSelectShuffle(*Shuffle, true);
}
/// This method looks for groups of shuffles acting on binops, of the form:
///  %x = shuffle ...
///  %y = shuffle ...
///  %a = binop %x, %y
///  %b = binop %x, %y
///  shuffle %a, %b, selectmask
/// We may, especially if the shuffle is wider than legal, be able to convert
/// the shuffle to a form where only parts of a and b need to be computed. On
/// architectures with no obvious "select" shuffle, this can reduce the total
/// number of operations if the target reports them as cheaper.
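/// For example (illustrative): if the selectmask only reads the low half of
/// %a and the high half of %b, the two wide binops can be rewritten as binops
/// on packed, half-width inputs plus shuffles that rebuild the original lane
/// order, which may be cheaper once the wide type is legalized.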
bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
  auto *SVI = cast<ShuffleVectorInst>(&I);
  auto *VT = cast<FixedVectorType>(I.getType());
  auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
  if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
      VT != Op0->getType())
    return false;

  auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
  auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
  auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
  auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
  SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
  auto checkSVNonOpUses = [&](Instruction *I) {
    if (!I || I->getOperand(0)->getType() != VT)
      return true;
    return any_of(I->users(), [&](User *U) {
      return U != Op0 && U != Op1 &&
             !(isa<ShuffleVectorInst>(U) &&
               (InputShuffles.contains(cast<Instruction>(U)) ||
                isInstructionTriviallyDead(cast<Instruction>(U))));
    });
  };
  if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
      checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
    return false;

  // Collect all the uses that are shuffles that we can transform together. We
  // may not have a single shuffle, but a group that can all be transformed
  // together profitably.
  SmallVector<ShuffleVectorInst *> Shuffles;
  auto collectShuffles = [&](Instruction *I) {
    for (auto *U : I->users()) {
      auto *SV = dyn_cast<ShuffleVectorInst>(U);
      if (!SV || SV->getType() != VT)
        return false;
      if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
          (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
        return false;
      if (!llvm::is_contained(Shuffles, SV))
        Shuffles.push_back(SV);
    }
    return true;
  };
  if (!collectShuffles(Op0) || !collectShuffles(Op1))
    return false;
  // From a reduction, we need to be processing a single shuffle, otherwise the
  // other uses will not be lane-invariant.
  if (FromReduction && Shuffles.size() > 1)
    return false;

  // Add any shuffle uses for the shuffles we have found, to include them in
  // our cost calculations.
  if (!FromReduction) {
    for (ShuffleVectorInst *SV : Shuffles) {
      for (auto *U : SV->users()) {
        ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
        if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
          Shuffles.push_back(SSV);
      }
    }
  }

  // For each of the output shuffles, we try to sort all the first vector's
  // elements to the beginning, followed by the second vector's elements at the
  // end. If the binops are legalized to smaller vectors, this may reduce the
  // total number of binops. We compute the ReconstructMask needed to convert
  // back to the original lane order.
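  // Illustrative example: with NumElts = 4 and an output shuffle mask of
  // <6, 2, 7, 3>, only lanes 2 and 3 of each binop are used. They are packed
  // as V1 = {2, 3} (from Op0) and V2 = {2, 3} (from Op1), and the
  // ReconstructMask that restores the original lane order is <4, 0, 5, 1>.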
  SmallVector<std::pair<int, int>> V1, V2;
  SmallVector<SmallVector<int>> OrigReconstructMasks;
  int MaxV1Elt = 0, MaxV2Elt = 0;
  unsigned NumElts = VT->getNumElements();
  for (ShuffleVectorInst *SVN : Shuffles) {
    SmallVector<int> Mask;
    SVN->getShuffleMask(Mask);

    // Check the operands are the same as the original, or reversed (in which
    // case we need to commute the mask).
    Value *SVOp0 = SVN->getOperand(0);
    Value *SVOp1 = SVN->getOperand(1);
    if (isa<UndefValue>(SVOp1)) {
      auto *SSV = cast<ShuffleVectorInst>(SVOp0);
      SVOp0 = SSV->getOperand(0);
      SVOp1 = SSV->getOperand(1);
      for (unsigned I = 0, E = Mask.size(); I != E; I++) {
        if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size()))
          return false;
        Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Mask[I]);
      }
    }
    if (SVOp0 == Op1 && SVOp1 == Op0) {
      std::swap(SVOp0, SVOp1);
      ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
    }
    if (SVOp0 != Op0 || SVOp1 != Op1)
      return false;

    // Calculate the reconstruction mask for this shuffle, as the mask needed
    // to take the packed values from Op0/Op1 and reconstruct the original
    // lane order.
    SmallVector<int> ReconstructMask;
    for (unsigned I = 0; I < Mask.size(); I++) {
      if (Mask[I] < 0) {
        ReconstructMask.push_back(-1);
      } else if (Mask[I] < static_cast<int>(NumElts)) {
        MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
        auto It = find_if(V1, [&](const std::pair<int, int> &A) {
          return Mask[I] == A.first;
        });
        if (It != V1.end())
          ReconstructMask.push_back(It - V1.begin());
        else {
          ReconstructMask.push_back(V1.size());
          V1.emplace_back(Mask[I], V1.size());
        }
      } else {
        MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
        auto It = find_if(V2, [&](const std::pair<int, int> &A) {
          return Mask[I] - static_cast<int>(NumElts) == A.first;
        });
        if (It != V2.end())
          ReconstructMask.push_back(NumElts + It - V2.begin());
        else {
          ReconstructMask.push_back(NumElts + V2.size());
          V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
        }
      }
    }

    // For reductions, we know that the lane ordering out doesn't alter the
    // result. In-order can help simplify the shuffle away.
    if (FromReduction)
      sort(ReconstructMask);
    OrigReconstructMasks.push_back(std::move(ReconstructMask));
  }

  // If the maximum elements used from V1 and V2 are not larger than the new
  // vectors, the vectors are already packed and performing the optimization
  // again will likely not help any further. This also prevents us from getting
  // stuck in a cycle in case the costs do not also rule it out.
  if (V1.empty() || V2.empty() ||
      (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
       MaxV2Elt == static_cast<int>(V2.size()) - 1))
    return false;
  // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
  // shuffle of another shuffle, or not a shuffle (that is treated like an
  // identity shuffle).
  auto GetBaseMaskValue = [&](Instruction *I, int M) {
    auto *SV = dyn_cast<ShuffleVectorInst>(I);
    if (!SV)
      return M;
    if (isa<UndefValue>(SV->getOperand(1)))
      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
        if (InputShuffles.contains(SSV))
          return SSV->getMaskValue(SV->getMaskValue(M));
    return SV->getMaskValue(M);
  };

  // Attempt to sort the inputs by ascending mask values to make simpler input
  // shuffles and push complex shuffles down to the uses. We sort on the first
  // of the two input shuffle orders, to try and get at least one input into a
  // nice order.
  auto SortBase = [&](Instruction *A, std::pair<int, int> X,
                      std::pair<int, int> Y) {
    int MXA = GetBaseMaskValue(A, X.first);
    int MYA = GetBaseMaskValue(A, Y.first);
    return MXA < MYA;
  };
  stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
    return SortBase(SVI0A, A, B);
  });
  stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
    return SortBase(SVI1A, A, B);
  });

  // Calculate our ReconstructMasks from the OrigReconstructMasks and the
  // modified order of the input shuffles.
  SmallVector<SmallVector<int>> ReconstructMasks;
  for (const auto &Mask : OrigReconstructMasks) {
    SmallVector<int> ReconstructMask;
    for (int M : Mask) {
      auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
        auto It = find_if(V, [M](auto A) { return A.second == M; });
        assert(It != V.end() && "Expected all entries in Mask");
        return std::distance(V.begin(), It);
      };
      if (M < 0)
        ReconstructMask.push_back(-1);
      else if (M < static_cast<int>(NumElts))
        ReconstructMask.push_back(FindIndex(V1, M));
      else
        ReconstructMask.push_back(NumElts + FindIndex(V2, M));
    }
    ReconstructMasks.push_back(std::move(ReconstructMask));
  }

  // Calculate the masks needed for the new input shuffles, which get padded
  // out with poison mask elements.
  SmallVector<int> V1A, V1B, V2A, V2B;
  for (unsigned I = 0; I < V1.size(); I++) {
    V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
    V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
  }
  for (unsigned I = 0; I < V2.size(); I++) {
    V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
    V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
  }
  while (V1A.size() < NumElts) {
    V1A.push_back(PoisonMaskElem);
    V1B.push_back(PoisonMaskElem);
  }
  while (V2A.size() < NumElts) {
    V2A.push_back(PoisonMaskElem);
    V2B.push_back(PoisonMaskElem);
  }

  auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
    auto *SV = dyn_cast<ShuffleVectorInst>(I);
    if (!SV)
      return C;
    return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
                                      ? TTI::SK_PermuteSingleSrc
                                      : TTI::SK_PermuteTwoSrc,
                                  VT, SV->getShuffleMask());
  };
  auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
    return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
  };

  // Get the costs of the shuffles + binops before and after with the new
  // shuffle masks.
  InstructionCost CostBefore =
      TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
      TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
  CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
                                InstructionCost(0), AddShuffleCost);
  CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
                                InstructionCost(0), AddShuffleCost);

  // The new binops will be unused for lanes past the used shuffle lengths.
  // These types attempt to get the correct cost for that from the target.
  FixedVectorType *Op0SmallVT =
      FixedVectorType::get(VT->getScalarType(), V1.size());
  FixedVectorType *Op1SmallVT =
      FixedVectorType::get(VT->getScalarType(), V2.size());
  InstructionCost CostAfter =
      TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
      TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
  CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
                               InstructionCost(0), AddShuffleMaskCost);
  std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
  CostAfter +=
      std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
                      InstructionCost(0), AddShuffleMaskCost);

  LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
  LLVM_DEBUG(dbgs() << "  CostBefore: " << CostBefore
                    << " vs CostAfter: " << CostAfter << "\n");
  if (CostBefore <= CostAfter)
    return false;

  // The cost model has passed, create the new instructions.
  auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
    auto *SV = dyn_cast<ShuffleVectorInst>(I);
    if (!SV)
      return I;
    if (isa<UndefValue>(SV->getOperand(1)))
      if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
        if (InputShuffles.contains(SSV))
          return SSV->getOperand(Op);
    return SV->getOperand(Op);
  };
  Builder.SetInsertPoint(SVI0A->getInsertionPointAfterDef());
  Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
                                             GetShuffleOperand(SVI0A, 1), V1A);
  Builder.SetInsertPoint(SVI0B->getInsertionPointAfterDef());
  Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
                                             GetShuffleOperand(SVI0B, 1), V1B);
  Builder.SetInsertPoint(SVI1A->getInsertionPointAfterDef());
  Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
                                             GetShuffleOperand(SVI1A, 1), V2A);
  Builder.SetInsertPoint(SVI1B->getInsertionPointAfterDef());
  Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
                                             GetShuffleOperand(SVI1B, 1), V2B);
  Builder.SetInsertPoint(Op0);
  Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
                                    NSV0A, NSV0B);
  if (auto *I = dyn_cast<Instruction>(NOp0))
    I->copyIRFlags(Op0, true);
  Builder.SetInsertPoint(Op1);
  Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
                                    NSV1A, NSV1B);
  if (auto *I = dyn_cast<Instruction>(NOp1))
    I->copyIRFlags(Op1, true);

  for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
    Builder.SetInsertPoint(Shuffles[S]);
    Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
    replaceValue(*Shuffles[S], *NSV);
  }

  Worklist.pushValue(NSV0A);
  Worklist.pushValue(NSV0B);
  Worklist.pushValue(NSV1A);
  Worklist.pushValue(NSV1B);
  for (auto *S : Shuffles)
    Worklist.add(S);
  return true;
}
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
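/// This is typically reached via the "vector-combine" function pass (e.g.
/// "opt -passes=vector-combine"); the early-pipeline invocation sets
/// TryEarlyFoldsOnly so that only the folds that are beneficial at any
/// pipeline position are attempted.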
bool VectorCombine::run() {
  if (DisableVectorCombine)
    return false;

  // Don't attempt vectorization if the target does not support vectors.
  if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
    return false;

  bool MadeChange = false;
  auto FoldInst = [this, &MadeChange](Instruction &I) {
    Builder.SetInsertPoint(&I);
    bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
    auto Opcode = I.getOpcode();

    // These folds should be beneficial regardless of when this pass is run
    // in the optimization pipeline.
    // The type checking is for run-time efficiency. We can avoid wasting time
    // dispatching to folding functions if there's no chance of matching.
    if (IsFixedVectorType) {
      switch (Opcode) {
      case Instruction::InsertElement:
        MadeChange |= vectorizeLoadInsert(I);
        break;
      case Instruction::ShuffleVector:
        MadeChange |= widenSubvectorLoad(I);
        break;
      default:
        break;
      }
    }

    // This transform works with scalable and fixed vectors.
    // TODO: Identify and allow other scalable transforms.
    if (isa<VectorType>(I.getType())) {
      MadeChange |= scalarizeBinopOrCmp(I);
      MadeChange |= scalarizeLoadExtract(I);
      MadeChange |= scalarizeVPIntrinsic(I);
    }

    if (Opcode == Instruction::Store)
      MadeChange |= foldSingleElementStore(I);

    // If this is an early pipeline invocation of this pass, we are done.
    if (TryEarlyFoldsOnly)
      return;

    // Otherwise, try folds that improve codegen but may interfere with
    // early IR canonicalizations.
    // The type checking is for run-time efficiency. We can avoid wasting time
    // dispatching to folding functions if there's no chance of matching.
    if (IsFixedVectorType) {
      switch (Opcode) {
      case Instruction::InsertElement:
        MadeChange |= foldInsExtFNeg(I);
        break;
      case Instruction::ShuffleVector:
        MadeChange |= foldShuffleOfBinops(I);
        MadeChange |= foldSelectShuffle(I);
        break;
      case Instruction::BitCast:
        MadeChange |= foldBitcastShuffle(I);
        break;
      default:
        break;
      }
    }

    switch (Opcode) {
    case Instruction::Call:
      MadeChange |= foldShuffleFromReductions(I);
      break;
    case Instruction::ICmp:
    case Instruction::FCmp:
      MadeChange |= foldExtractExtract(I);
      break;
    default:
      if (Instruction::isBinaryOp(Opcode)) {
        MadeChange |= foldExtractExtract(I);
        MadeChange |= foldExtractedCmps(I);
      }
      break;
    }
  };

  for (BasicBlock &BB : F) {
    // Ignore unreachable basic blocks.
    if (!DT.isReachableFromEntry(&BB))
      continue;
    // Use early increment range so that we can erase instructions in loop.
    for (Instruction &I : make_early_inc_range(BB)) {
      if (I.isDebugOrPseudoInst())
        continue;
      FoldInst(I);
    }
  }

  while (!Worklist.isEmpty()) {
    Instruction *I = Worklist.removeOne();
    if (!I)
      continue;

    if (isInstructionTriviallyDead(I)) {
      eraseInstruction(*I);
      continue;
    }

    FoldInst(*I);
  }

  return MadeChange;
}
PreservedAnalyses VectorCombinePass::run(Function &F,
                                         FunctionAnalysisManager &FAM) {
  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  AAResults &AA = FAM.getResult<AAManager>(F);
  VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly);
  if (!Combiner.run())
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}