//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include <cmath>
#include <optional>
using namespace llvm;

#define DEBUG_TYPE "riscvtti"
static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);
static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  const DataLayout &DL = getDataLayout();
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
                                    getST()->getFeatureBits());
}
// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative, in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::And:
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}
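
// Immediates used by intrinsics are not modelled specially; reporting them as
// free keeps ConstantHoisting from hoisting them out.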
InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}
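
// Scalar popcount is only considered cheap when Zbb provides cpop.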
TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}
std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}
std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}
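
// Register width reported to the vectorizers. For RVV the answer is scaled by
// the LMUL selected via -riscv-v-register-bit-width-lmul.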
TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::Fixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::Fixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::Scalable((ST->hasVInstructions() &&
                               ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
                                  ? LMUL * RISCV::RVVBitsPerBlock
                                  : 0);
  }

  llvm_unreachable("Unsupported register kind");
}
InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}
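
// Choose the index vector type used when costing a vrgather.vv of DataVT: the
// data type converted to integers, with i16 indices when the natural index
// element would be wider than XLEN.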
static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}
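
// Shuffles are costed in terms of the vrgather/vslide/vmerge sequences they
// lower to; fixed-length vectors get a few special cases before falling back
// to the generic scalable handling.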
InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves and
        // deinterleaves of 2 vectors can be lowered into the following
        // instructions.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          //   vsetivli     zero, 4, e8, mf4, ta, ma (ignored)
          //   vwaddu.vv    v10, v8, v9
          //   li           a0, -1   (ignored)
          //   vwmaccu.vx   v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            //   vnsrl.wi     v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * TLI->getLMULCost(LT.second);
          }
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost + TLI->getVRGatherVVCost(LT.second);
      }
      break;
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost;
      }
      break;
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. Providing an accurate cost only for splits where
      // the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0; I < NumRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    //   vsetivli       zero, 4, e8, mf2, tu, ma (ignored)
    //   vslidedown.vi  v8, v9, 2
    return LT.first * TLI->getVSlideCost(LT.second);
  case TTI::SK_InsertSubvector:
    //   vsetivli     zero, 4, e8, mf2, tu, ma (ignored)
    //   vslideup.vi  v8, v9, 2
    return LT.first * TLI->getVSlideCost(LT.second);
  case TTI::SK_Select: {
    //   vsetivli     zero, 8, e8, mf2, ta, ma (ignored)
    //   vmerge.vvm   v8, v9, v8, v0
    return LT.first * 3 * TLI->getLMULCost(LT.second);
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        //   vsetivli  zero, 2, e8, mf8, ta, ma (ignored)
        //   vmsne.vi  v0, v8, 0
        return LT.first * TLI->getLMULCost(LT.second) * 3;
      }
      //   vsetivli    zero, 2, e8, mf8, ta, mu (ignored)
      //   vmerge.vim  v8, v8, 1, v0
      //   vmsne.vi    v0, v8, 0
      return LT.first * TLI->getLMULCost(LT.second) * 6;
    }

    if (HasScalar)
      return LT.first * TLI->getLMULCost(LT.second);

    //   vrgather.vi  v9, v8, 0
    return LT.first * TLI->getVRGatherVICost(LT.second);
  }
  case TTI::SK_Splice:
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    return 2 * LT.first * TLI->getVSlideCost(LT.second);
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    //   vsetvli      a1, zero, e8, mf8, ta, mu (ignored)
    //   vrsub.vx     v10, v9, a0
    //   vrgather.vv  v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second);
    // A mask operation additionally requires an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
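
// When the masked load/store is legal for RVV, it costs the same as the
// corresponding unmasked memory op.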
InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                    Align Alignment, unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}
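
// Interleaved accesses are costed either as a single segment load/store when
// they can be lowered to vlseg/vsseg, or as a wide memory op plus the shuffles
// needed to (de)interleave it.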
InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // The interleaved memory access pass will lower interleaved memory ops (i.e
  // a load and store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases then we can treat it as if it's just one (legal)
  // memory op.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here, because
      // its getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
        return LT.first + LegalMemCost;
      }
    }
  }

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of the wide shuffle.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}
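
// Gathers/scatters are costed as one element-sized memory op per (estimated)
// lane.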
InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}
// Currently, these represent both throughput and codesize costs
// for the respective intrinsics. The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::v2f32, 9},
    {Intrinsic::floor, MVT::v4f32, 9},
    {Intrinsic::floor, MVT::v8f32, 9},
    {Intrinsic::floor, MVT::v16f32, 9},
    {Intrinsic::floor, MVT::nxv1f32, 9},
    {Intrinsic::floor, MVT::nxv2f32, 9},
    {Intrinsic::floor, MVT::nxv4f32, 9},
    {Intrinsic::floor, MVT::nxv8f32, 9},
    {Intrinsic::floor, MVT::nxv16f32, 9},
    {Intrinsic::floor, MVT::v2f64, 9},
    {Intrinsic::floor, MVT::v4f64, 9},
    {Intrinsic::floor, MVT::v8f64, 9},
    {Intrinsic::floor, MVT::v16f64, 9},
    {Intrinsic::floor, MVT::nxv1f64, 9},
    {Intrinsic::floor, MVT::nxv2f64, 9},
    {Intrinsic::floor, MVT::nxv4f64, 9},
    {Intrinsic::floor, MVT::nxv8f64, 9},
    {Intrinsic::ceil, MVT::v2f32, 9},
    {Intrinsic::ceil, MVT::v4f32, 9},
    {Intrinsic::ceil, MVT::v8f32, 9},
    {Intrinsic::ceil, MVT::v16f32, 9},
    {Intrinsic::ceil, MVT::nxv1f32, 9},
    {Intrinsic::ceil, MVT::nxv2f32, 9},
    {Intrinsic::ceil, MVT::nxv4f32, 9},
    {Intrinsic::ceil, MVT::nxv8f32, 9},
    {Intrinsic::ceil, MVT::nxv16f32, 9},
    {Intrinsic::ceil, MVT::v2f64, 9},
    {Intrinsic::ceil, MVT::v4f64, 9},
    {Intrinsic::ceil, MVT::v8f64, 9},
    {Intrinsic::ceil, MVT::v16f64, 9},
    {Intrinsic::ceil, MVT::nxv1f64, 9},
    {Intrinsic::ceil, MVT::nxv2f64, 9},
    {Intrinsic::ceil, MVT::nxv4f64, 9},
    {Intrinsic::ceil, MVT::nxv8f64, 9},
    {Intrinsic::trunc, MVT::v2f32, 7},
    {Intrinsic::trunc, MVT::v4f32, 7},
    {Intrinsic::trunc, MVT::v8f32, 7},
    {Intrinsic::trunc, MVT::v16f32, 7},
    {Intrinsic::trunc, MVT::nxv1f32, 7},
    {Intrinsic::trunc, MVT::nxv2f32, 7},
    {Intrinsic::trunc, MVT::nxv4f32, 7},
    {Intrinsic::trunc, MVT::nxv8f32, 7},
    {Intrinsic::trunc, MVT::nxv16f32, 7},
    {Intrinsic::trunc, MVT::v2f64, 7},
    {Intrinsic::trunc, MVT::v4f64, 7},
    {Intrinsic::trunc, MVT::v8f64, 7},
    {Intrinsic::trunc, MVT::v16f64, 7},
    {Intrinsic::trunc, MVT::nxv1f64, 7},
    {Intrinsic::trunc, MVT::nxv2f64, 7},
    {Intrinsic::trunc, MVT::nxv4f64, 7},
    {Intrinsic::trunc, MVT::nxv8f64, 7},
    {Intrinsic::round, MVT::v2f32, 9},
    {Intrinsic::round, MVT::v4f32, 9},
    {Intrinsic::round, MVT::v8f32, 9},
    {Intrinsic::round, MVT::v16f32, 9},
    {Intrinsic::round, MVT::nxv1f32, 9},
    {Intrinsic::round, MVT::nxv2f32, 9},
    {Intrinsic::round, MVT::nxv4f32, 9},
    {Intrinsic::round, MVT::nxv8f32, 9},
    {Intrinsic::round, MVT::nxv16f32, 9},
    {Intrinsic::round, MVT::v2f64, 9},
    {Intrinsic::round, MVT::v4f64, 9},
    {Intrinsic::round, MVT::v8f64, 9},
    {Intrinsic::round, MVT::v16f64, 9},
    {Intrinsic::round, MVT::nxv1f64, 9},
    {Intrinsic::round, MVT::nxv2f64, 9},
    {Intrinsic::round, MVT::nxv4f64, 9},
    {Intrinsic::round, MVT::nxv8f64, 9},
    {Intrinsic::roundeven, MVT::v2f32, 9},
    {Intrinsic::roundeven, MVT::v4f32, 9},
    {Intrinsic::roundeven, MVT::v8f32, 9},
    {Intrinsic::roundeven, MVT::v16f32, 9},
    {Intrinsic::roundeven, MVT::nxv1f32, 9},
    {Intrinsic::roundeven, MVT::nxv2f32, 9},
    {Intrinsic::roundeven, MVT::nxv4f32, 9},
    {Intrinsic::roundeven, MVT::nxv8f32, 9},
    {Intrinsic::roundeven, MVT::nxv16f32, 9},
    {Intrinsic::roundeven, MVT::v2f64, 9},
    {Intrinsic::roundeven, MVT::v4f64, 9},
    {Intrinsic::roundeven, MVT::v8f64, 9},
    {Intrinsic::roundeven, MVT::v16f64, 9},
    {Intrinsic::roundeven, MVT::nxv1f64, 9},
    {Intrinsic::roundeven, MVT::nxv2f64, 9},
    {Intrinsic::roundeven, MVT::nxv4f64, 9},
    {Intrinsic::roundeven, MVT::nxv8f64, 9},
    {Intrinsic::rint, MVT::v2f32, 7},
    {Intrinsic::rint, MVT::v4f32, 7},
    {Intrinsic::rint, MVT::v8f32, 7},
    {Intrinsic::rint, MVT::v16f32, 7},
    {Intrinsic::rint, MVT::nxv1f32, 7},
    {Intrinsic::rint, MVT::nxv2f32, 7},
    {Intrinsic::rint, MVT::nxv4f32, 7},
    {Intrinsic::rint, MVT::nxv8f32, 7},
    {Intrinsic::rint, MVT::nxv16f32, 7},
    {Intrinsic::rint, MVT::v2f64, 7},
    {Intrinsic::rint, MVT::v4f64, 7},
    {Intrinsic::rint, MVT::v8f64, 7},
    {Intrinsic::rint, MVT::v16f64, 7},
    {Intrinsic::rint, MVT::nxv1f64, 7},
    {Intrinsic::rint, MVT::nxv2f64, 7},
    {Intrinsic::rint, MVT::nxv4f64, 7},
    {Intrinsic::rint, MVT::nxv8f64, 7},
    {Intrinsic::lrint, MVT::v2i32, 1},
    {Intrinsic::lrint, MVT::v4i32, 1},
    {Intrinsic::lrint, MVT::v8i32, 1},
    {Intrinsic::lrint, MVT::v16i32, 1},
    {Intrinsic::lrint, MVT::nxv1i32, 1},
    {Intrinsic::lrint, MVT::nxv2i32, 1},
    {Intrinsic::lrint, MVT::nxv4i32, 1},
    {Intrinsic::lrint, MVT::nxv8i32, 1},
    {Intrinsic::lrint, MVT::nxv16i32, 1},
    {Intrinsic::lrint, MVT::v2i64, 1},
    {Intrinsic::lrint, MVT::v4i64, 1},
    {Intrinsic::lrint, MVT::v8i64, 1},
    {Intrinsic::lrint, MVT::v16i64, 1},
    {Intrinsic::lrint, MVT::nxv1i64, 1},
    {Intrinsic::lrint, MVT::nxv2i64, 1},
    {Intrinsic::lrint, MVT::nxv4i64, 1},
    {Intrinsic::lrint, MVT::nxv8i64, 1},
    {Intrinsic::llrint, MVT::v2i64, 1},
    {Intrinsic::llrint, MVT::v4i64, 1},
    {Intrinsic::llrint, MVT::v8i64, 1},
    {Intrinsic::llrint, MVT::v16i64, 1},
    {Intrinsic::llrint, MVT::nxv1i64, 1},
    {Intrinsic::llrint, MVT::nxv2i64, 1},
    {Intrinsic::llrint, MVT::nxv4i64, 1},
    {Intrinsic::llrint, MVT::nxv8i64, 1},
    {Intrinsic::nearbyint, MVT::v2f32, 9},
    {Intrinsic::nearbyint, MVT::v4f32, 9},
    {Intrinsic::nearbyint, MVT::v8f32, 9},
    {Intrinsic::nearbyint, MVT::v16f32, 9},
    {Intrinsic::nearbyint, MVT::nxv1f32, 9},
    {Intrinsic::nearbyint, MVT::nxv2f32, 9},
    {Intrinsic::nearbyint, MVT::nxv4f32, 9},
    {Intrinsic::nearbyint, MVT::nxv8f32, 9},
    {Intrinsic::nearbyint, MVT::nxv16f32, 9},
    {Intrinsic::nearbyint, MVT::v2f64, 9},
    {Intrinsic::nearbyint, MVT::v4f64, 9},
    {Intrinsic::nearbyint, MVT::v8f64, 9},
    {Intrinsic::nearbyint, MVT::v16f64, 9},
    {Intrinsic::nearbyint, MVT::nxv1f64, 9},
    {Intrinsic::nearbyint, MVT::nxv2f64, 9},
    {Intrinsic::nearbyint, MVT::nxv4f64, 9},
    {Intrinsic::nearbyint, MVT::nxv8f64, 9},
    {Intrinsic::bswap, MVT::v2i16, 3},
    {Intrinsic::bswap, MVT::v4i16, 3},
    {Intrinsic::bswap, MVT::v8i16, 3},
    {Intrinsic::bswap, MVT::v16i16, 3},
    {Intrinsic::bswap, MVT::nxv1i16, 3},
    {Intrinsic::bswap, MVT::nxv2i16, 3},
    {Intrinsic::bswap, MVT::nxv4i16, 3},
    {Intrinsic::bswap, MVT::nxv8i16, 3},
    {Intrinsic::bswap, MVT::nxv16i16, 3},
    {Intrinsic::bswap, MVT::v2i32, 12},
    {Intrinsic::bswap, MVT::v4i32, 12},
    {Intrinsic::bswap, MVT::v8i32, 12},
    {Intrinsic::bswap, MVT::v16i32, 12},
    {Intrinsic::bswap, MVT::nxv1i32, 12},
    {Intrinsic::bswap, MVT::nxv2i32, 12},
    {Intrinsic::bswap, MVT::nxv4i32, 12},
    {Intrinsic::bswap, MVT::nxv8i32, 12},
    {Intrinsic::bswap, MVT::nxv16i32, 12},
    {Intrinsic::bswap, MVT::v2i64, 31},
    {Intrinsic::bswap, MVT::v4i64, 31},
    {Intrinsic::bswap, MVT::v8i64, 31},
    {Intrinsic::bswap, MVT::v16i64, 31},
    {Intrinsic::bswap, MVT::nxv1i64, 31},
    {Intrinsic::bswap, MVT::nxv2i64, 31},
    {Intrinsic::bswap, MVT::nxv4i64, 31},
    {Intrinsic::bswap, MVT::nxv8i64, 31},
    {Intrinsic::vp_bswap, MVT::v2i16, 3},
    {Intrinsic::vp_bswap, MVT::v4i16, 3},
    {Intrinsic::vp_bswap, MVT::v8i16, 3},
    {Intrinsic::vp_bswap, MVT::v16i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
    {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
    {Intrinsic::vp_bswap, MVT::v2i32, 12},
    {Intrinsic::vp_bswap, MVT::v4i32, 12},
    {Intrinsic::vp_bswap, MVT::v8i32, 12},
    {Intrinsic::vp_bswap, MVT::v16i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
    {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
    {Intrinsic::vp_bswap, MVT::v2i64, 31},
    {Intrinsic::vp_bswap, MVT::v4i64, 31},
    {Intrinsic::vp_bswap, MVT::v8i64, 31},
    {Intrinsic::vp_bswap, MVT::v16i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
    {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
    {Intrinsic::vp_fshl, MVT::v2i8, 7},
    {Intrinsic::vp_fshl, MVT::v4i8, 7},
    {Intrinsic::vp_fshl, MVT::v8i8, 7},
    {Intrinsic::vp_fshl, MVT::v16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshl, MVT::v2i16, 7},
    {Intrinsic::vp_fshl, MVT::v4i16, 7},
    {Intrinsic::vp_fshl, MVT::v8i16, 7},
    {Intrinsic::vp_fshl, MVT::v16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshl, MVT::v2i32, 7},
    {Intrinsic::vp_fshl, MVT::v4i32, 7},
    {Intrinsic::vp_fshl, MVT::v8i32, 7},
    {Intrinsic::vp_fshl, MVT::v16i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshl, MVT::v2i64, 7},
    {Intrinsic::vp_fshl, MVT::v4i64, 7},
    {Intrinsic::vp_fshl, MVT::v8i64, 7},
    {Intrinsic::vp_fshl, MVT::v16i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
    {Intrinsic::vp_fshr, MVT::v2i8, 7},
    {Intrinsic::vp_fshr, MVT::v4i8, 7},
    {Intrinsic::vp_fshr, MVT::v8i8, 7},
    {Intrinsic::vp_fshr, MVT::v16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
    {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
    {Intrinsic::vp_fshr, MVT::v2i16, 7},
    {Intrinsic::vp_fshr, MVT::v4i16, 7},
    {Intrinsic::vp_fshr, MVT::v8i16, 7},
    {Intrinsic::vp_fshr, MVT::v16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
    {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
    {Intrinsic::vp_fshr, MVT::v2i32, 7},
    {Intrinsic::vp_fshr, MVT::v4i32, 7},
    {Intrinsic::vp_fshr, MVT::v8i32, 7},
    {Intrinsic::vp_fshr, MVT::v16i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
    {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
    {Intrinsic::vp_fshr, MVT::v2i64, 7},
    {Intrinsic::vp_fshr, MVT::v4i64, 7},
    {Intrinsic::vp_fshr, MVT::v8i64, 7},
    {Intrinsic::vp_fshr, MVT::v16i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
    {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
    {Intrinsic::bitreverse, MVT::v2i8, 17},
    {Intrinsic::bitreverse, MVT::v4i8, 17},
    {Intrinsic::bitreverse, MVT::v8i8, 17},
    {Intrinsic::bitreverse, MVT::v16i8, 17},
    {Intrinsic::bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::bitreverse, MVT::v2i16, 24},
    {Intrinsic::bitreverse, MVT::v4i16, 24},
    {Intrinsic::bitreverse, MVT::v8i16, 24},
    {Intrinsic::bitreverse, MVT::v16i16, 24},
    {Intrinsic::bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::bitreverse, MVT::v2i32, 33},
    {Intrinsic::bitreverse, MVT::v4i32, 33},
    {Intrinsic::bitreverse, MVT::v8i32, 33},
    {Intrinsic::bitreverse, MVT::v16i32, 33},
    {Intrinsic::bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::bitreverse, MVT::v2i64, 52},
    {Intrinsic::bitreverse, MVT::v4i64, 52},
    {Intrinsic::bitreverse, MVT::v8i64, 52},
    {Intrinsic::bitreverse, MVT::v16i64, 52},
    {Intrinsic::bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::bitreverse, MVT::nxv8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
    {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
    {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
    {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
    {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
    {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
    {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
    {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
    {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
    {Intrinsic::ctpop, MVT::v2i8, 12},
    {Intrinsic::ctpop, MVT::v4i8, 12},
    {Intrinsic::ctpop, MVT::v8i8, 12},
    {Intrinsic::ctpop, MVT::v16i8, 12},
    {Intrinsic::ctpop, MVT::nxv1i8, 12},
    {Intrinsic::ctpop, MVT::nxv2i8, 12},
    {Intrinsic::ctpop, MVT::nxv4i8, 12},
    {Intrinsic::ctpop, MVT::nxv8i8, 12},
    {Intrinsic::ctpop, MVT::nxv16i8, 12},
    {Intrinsic::ctpop, MVT::v2i16, 19},
    {Intrinsic::ctpop, MVT::v4i16, 19},
    {Intrinsic::ctpop, MVT::v8i16, 19},
    {Intrinsic::ctpop, MVT::v16i16, 19},
    {Intrinsic::ctpop, MVT::nxv1i16, 19},
    {Intrinsic::ctpop, MVT::nxv2i16, 19},
    {Intrinsic::ctpop, MVT::nxv4i16, 19},
    {Intrinsic::ctpop, MVT::nxv8i16, 19},
    {Intrinsic::ctpop, MVT::nxv16i16, 19},
    {Intrinsic::ctpop, MVT::v2i32, 20},
    {Intrinsic::ctpop, MVT::v4i32, 20},
    {Intrinsic::ctpop, MVT::v8i32, 20},
    {Intrinsic::ctpop, MVT::v16i32, 20},
    {Intrinsic::ctpop, MVT::nxv1i32, 20},
    {Intrinsic::ctpop, MVT::nxv2i32, 20},
    {Intrinsic::ctpop, MVT::nxv4i32, 20},
    {Intrinsic::ctpop, MVT::nxv8i32, 20},
    {Intrinsic::ctpop, MVT::nxv16i32, 20},
    {Intrinsic::ctpop, MVT::v2i64, 21},
    {Intrinsic::ctpop, MVT::v4i64, 21},
    {Intrinsic::ctpop, MVT::v8i64, 21},
    {Intrinsic::ctpop, MVT::v16i64, 21},
    {Intrinsic::ctpop, MVT::nxv1i64, 21},
    {Intrinsic::ctpop, MVT::nxv2i64, 21},
    {Intrinsic::ctpop, MVT::nxv4i64, 21},
    {Intrinsic::ctpop, MVT::nxv8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v2i8, 12},
    {Intrinsic::vp_ctpop, MVT::v4i8, 12},
    {Intrinsic::vp_ctpop, MVT::v8i8, 12},
    {Intrinsic::vp_ctpop, MVT::v16i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
    {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
    {Intrinsic::vp_ctpop, MVT::v2i16, 19},
    {Intrinsic::vp_ctpop, MVT::v4i16, 19},
    {Intrinsic::vp_ctpop, MVT::v8i16, 19},
    {Intrinsic::vp_ctpop, MVT::v16i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
    {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
    {Intrinsic::vp_ctpop, MVT::v2i32, 20},
    {Intrinsic::vp_ctpop, MVT::v4i32, 20},
    {Intrinsic::vp_ctpop, MVT::v8i32, 20},
    {Intrinsic::vp_ctpop, MVT::v16i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
    {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
    {Intrinsic::vp_ctpop, MVT::v2i64, 21},
    {Intrinsic::vp_ctpop, MVT::v4i64, 21},
    {Intrinsic::vp_ctpop, MVT::v8i64, 21},
    {Intrinsic::vp_ctpop, MVT::v16i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
    {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
    {Intrinsic::vp_ctlz, MVT::v2i8, 19},
    {Intrinsic::vp_ctlz, MVT::v4i8, 19},
    {Intrinsic::vp_ctlz, MVT::v8i8, 19},
    {Intrinsic::vp_ctlz, MVT::v16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
    {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
    {Intrinsic::vp_ctlz, MVT::v2i16, 28},
    {Intrinsic::vp_ctlz, MVT::v4i16, 28},
    {Intrinsic::vp_ctlz, MVT::v8i16, 28},
    {Intrinsic::vp_ctlz, MVT::v16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
    {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
    {Intrinsic::vp_ctlz, MVT::v2i32, 31},
    {Intrinsic::vp_ctlz, MVT::v4i32, 31},
    {Intrinsic::vp_ctlz, MVT::v8i32, 31},
    {Intrinsic::vp_ctlz, MVT::v16i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
    {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
    {Intrinsic::vp_ctlz, MVT::v2i64, 35},
    {Intrinsic::vp_ctlz, MVT::v4i64, 35},
    {Intrinsic::vp_ctlz, MVT::v8i64, 35},
    {Intrinsic::vp_ctlz, MVT::v16i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
    {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
    {Intrinsic::vp_cttz, MVT::v2i8, 16},
    {Intrinsic::vp_cttz, MVT::v4i8, 16},
    {Intrinsic::vp_cttz, MVT::v8i8, 16},
    {Intrinsic::vp_cttz, MVT::v16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
    {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
    {Intrinsic::vp_cttz, MVT::v2i16, 23},
    {Intrinsic::vp_cttz, MVT::v4i16, 23},
    {Intrinsic::vp_cttz, MVT::v8i16, 23},
    {Intrinsic::vp_cttz, MVT::v16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
    {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
    {Intrinsic::vp_cttz, MVT::v2i32, 24},
    {Intrinsic::vp_cttz, MVT::v4i32, 24},
    {Intrinsic::vp_cttz, MVT::v8i32, 24},
    {Intrinsic::vp_cttz, MVT::v16i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
    {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
    {Intrinsic::vp_cttz, MVT::v2i64, 25},
    {Intrinsic::vp_cttz, MVT::v4i64, 25},
    {Intrinsic::vp_cttz, MVT::v8i64, 25},
    {Intrinsic::vp_cttz, MVT::v16i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
    {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
};
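
// Map a VP intrinsic ID to its SelectionDAG opcode via the table in
// llvm/IR/VPIntrinsics.def; returns ISD::DELETED_NODE if there is no mapping.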
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}
InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if ((ST->hasVInstructions() && LT.second.isVector()) ||
        (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
      return LT.first;
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first * 2;
    }
    break;
  }
  // TODO: add more intrinsic
  case Intrinsic::experimental_stepvector: {
    unsigned Cost = 1; // vid
    auto LT = getTypeLegalizationCost(RetTy);
    return Cost + (LT.first - 1);
  }
  case Intrinsic::vp_rint: {
    // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
    unsigned Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write for fflags than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    auto LT = getTypeLegalizationCost(RetTy);
    if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                            ICA.getID(), LT.second))
      return LT.first * Entry->Cost;
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
    // FIXME: Need to compute legalizing cost for illegal types.
    if (!isTypeLegal(Src) || !isTypeLegal(Dst))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    // Skip if element size of Dst or Src is bigger than ELEN.
    if (Src->getScalarSizeInBits() > ST->getELen() ||
        Dst->getScalarSizeInBits() > ST->getELen())
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // FIXME: Need to consider vsetvli and lmul.
    int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                  (int)Log2_32(Src->getScalarSizeInBits());
    switch (ISD) {
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
      if (Src->getScalarSizeInBits() == 1) {
        // We do not use vsext/vzext to extend from mask vector.
        // Instead we use the following instructions to extend from mask vector:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        return 2;
      }
      return 1;
    case ISD::TRUNCATE:
      if (Dst->getScalarSizeInBits() == 1) {
        // We do not use several vncvt to truncate to mask vector. So we could
        // not use PowDiff to calculate it.
        // Instead we use the following instructions to truncate to mask vector:
        // vand.vi v8, v8, 1
        // vmsne.vi v0, v8, 0
        return 2;
      }
      [[fallthrough]];
    case ISD::FP_EXTEND:
    case ISD::FP_ROUND:
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT:
    case ISD::SINT_TO_FP:
    case ISD::UINT_TO_FP:
      if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
        // The cost of convert from or to mask vector is different from other
        // cases. We could not use PowDiff to calculate it.
        // For mask vector to fp, we should use the following instructions:
        // vmv.v.i v8, 0
        // vmerge.vim v8, v8, -1, v0
        // vfcvt.f.x.v v8, v8
        //
        // And for fp vector to mask, we use:
        // vfncvt.rtz.x.f.w v9, v8
        // vand.vi v8, v9, 1
        // vmsne.vi v0, v8, 0
        return 3;
      }
      if (std::abs(PowDiff) <= 1)
        return 1;
      // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
      // so it only needs two conversions.
      if (Src->isIntOrIntVectorTy())
        return 2;
      // Counts of narrow/widen instructions.
      return std::abs(PowDiff);
    }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
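
// Estimate the number of lanes (VL) processed for a vector type. For scalable
// types this is based on the vscale-for-tuning value rather than the real
// runtime VLEN.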
unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}
InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
    // cost 2, but we don't have enough info here so we slightly over cost.
    return (LT.first - 1) + 3;

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}
InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1))
    // vcpop sequences, see vreduction-mask.ll
    return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);

  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
  InstructionCost BaseCost = 2;

  if (CostKind == TTI::TCK_CodeSize)
    return (LT.first - 1) + BaseCost;

  unsigned VL = getEstimatedVLFor(Ty);
  if (TTI::requiresOrderedReduction(FMF))
    return (LT.first - 1) + BaseCost + VL;
  return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
}
InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}
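
// Extra cost of materialising a constant operand of a store: free for scalar
// constants, a single splat for uniform vector constants, otherwise a constant
// pool load.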
InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");

  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.x.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}
InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
  InstructionCost BaseCost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                             CostKind, OpInfo, I);
  // Assume memory ops cost scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector())
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}
InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                                 Type *CondTy,
                                                 CmpInst::Predicate VecPred,
                                                 TTI::TargetCostKind CostKind,
                                                 const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     I);

  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return LT.first * 3;
      }
      // vselect and max/min are supported natively.
      return LT.first * 1;
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      return LT.first * 5;
    }

    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return LT.first * 3;
  }

  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
      ValTy->isVectorTy()) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

    // Support natively.
    if (CmpInst::isIntPredicate(VecPred))
      return LT.first * 1;

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    switch (VecPred) {
    // Support natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}
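
// Insert/extract of a vector element is modelled as a vslide plus a vmv; i1
// vectors are first widened to e8.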
1487 InstructionCost
RISCVTTIImpl::getVectorInstrCost(unsigned Opcode
, Type
*Val
,
1488 TTI::TargetCostKind CostKind
,
1489 unsigned Index
, Value
*Op0
,
1491 assert(Val
->isVectorTy() && "This must be a vector type");
  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with a non-constant index is very costly when
    // scalarized; estimate the cost of a loads/stores sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
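    // Rough illustration (not a hard number): for a 4-element vector with an
    // unknown index, an extract is modeled as 4 element stores plus 1 scalar
    // load, and an insert as 4 element stores, 1 scalar store, and 4 element
    // loads.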
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For an unsupported scalable vector, propagate the invalid legalization
  // cost.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV we can use vslidedown + vmv.x.s to extract an element from a vector
  // and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement we also need an addi to add 1 to the index that feeds
  // the vslideup, hence the extra unit of slide cost.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
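  // Illustrative totals (before the refinements below): an extract at an
  // unknown index costs vslidedown + vmv.x.s = 2, while an insert costs
  // addi + vslideup + vmv.s.x = 3.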

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Extracting or inserting an i64 element on a target with XLEN=32 needs
  // extra instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8
    //
    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2
    //
    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}

InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

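  // For example (illustrative): in "add %v, splat(5)" the splat can fold into
  // a vadd.vi/vadd.vx form, so the constant is treated as free above, while a
  // general vector constant is priced as a constant-pool load.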
  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG:
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}

// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we only take GEP instructions into account (although a
  // pointer here may also be an alloca, a plain value, a constant or constant
  // expression, a PHI, a bitcast, etc.). Typically, if Base is not itself a
  // GEP instruction and all the pointers are relative to the same base
  // address, the rest are GEP instructions, PHIs, bitcasts or constants. With
  // a common base we simply cost each non-Base GEP as an ADD operation when
  // any of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is the
  // sum of the costs of the individual GEP instructions.
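  // For example (illustrative): a chain of unit-stride GEPs off one base, all
  // feeding accesses whose immediate offsets fit a reg+imm addressing mode,
  // adds no extra cost here because the offsets are expected to fold into the
  // memory instructions.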
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None});
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}

void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  // would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, independent of the conditions
  // checked below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow at most one exit other than the latch. This acts as an early
  // exit, mirroring the profitability check of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force-unrolling small loops can be very useful because of the branch-taken
  // cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}

void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
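    // For example (illustrative): <vscale x 8 x i32> has a known minimum size
    // of 256 bits, so with RVVBitsPerBlock = 64 the line above returns
    // divideCeil(256, 64) = 4 vector registers (i.e. an LMUL=4 group).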

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}

unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in the LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
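  // For example (illustrative): with a 256-bit fixed-width vector register and
  // 32-bit elements, this yields a maximum VF of 256 / 32 = 8 for SLP.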
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If there are no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}

bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give the instruction count first
  // priority.
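  // For example (illustrative): an LSR solution needing 3 instructions is
  // preferred over one needing 4 even if the latter uses fewer registers; the
  // remaining fields only break ties, in the order listed below.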
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}