//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RISCVTargetTransformInfo.h"
#include "MCTargetDesc/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "riscvtti"

static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);

InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                      TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKind
  if (!VT.isVector())
    return InstructionCost::getInvalid();
  size_t NumInstr = OpCodes.size();
  if (CostKind == TTI::TCK_CodeSize)
    return NumInstr;
  InstructionCost LMULCost = TLI->getLMULCost(VT);
  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
    return LMULCost * NumInstr;
  InstructionCost Cost = 0;
  for (auto Op : OpCodes) {
    switch (Op) {
    case RISCV::VRGATHER_VI:
      Cost += TLI->getVRGatherVICost(VT);
      break;
    case RISCV::VRGATHER_VV:
      Cost += TLI->getVRGatherVVCost(VT);
      break;
    case RISCV::VSLIDEUP_VI:
    case RISCV::VSLIDEDOWN_VI:
      Cost += TLI->getVSlideVICost(VT);
      break;
    case RISCV::VSLIDEUP_VX:
    case RISCV::VSLIDEDOWN_VX:
      Cost += TLI->getVSlideVXCost(VT);
      break;
    case RISCV::VREDMAX_VS:
    case RISCV::VREDMIN_VS:
    case RISCV::VREDMAXU_VS:
    case RISCV::VREDMINU_VS:
    case RISCV::VREDSUM_VS:
    case RISCV::VREDAND_VS:
    case RISCV::VREDOR_VS:
    case RISCV::VREDXOR_VS:
    case RISCV::VFREDMAX_VS:
    case RISCV::VFREDMIN_VS:
    case RISCV::VFREDUSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += Log2_32_Ceil(VL);
      break;
    }
    case RISCV::VFREDOSUM_VS: {
      unsigned VL = VT.getVectorMinNumElements();
      if (!VT.isFixedLengthVector())
        VL *= *getVScaleForTuning();
      Cost += VL;
      break;
    }
    case RISCV::VMANDN_MM:
    case RISCV::VMNAND_MM:
    case RISCV::VFIRST_M:
      Cost += 1;
      break;
    default:
      Cost += LMULCost;
    }
  }
  return Cost;
}

static InstructionCost getIntImmCostImpl(const DataLayout &DL,
                                         const RISCVSubtarget *ST,
                                         const APInt &Imm, Type *Ty,
                                         TTI::TargetCostKind CostKind,
                                         bool FreeZeroes) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Otherwise, we check how many instructions it will take to materialise.
  return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
                                    /*CompressionCost=*/false, FreeZeroes);
}

InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                            TTI::TargetCostKind CostKind) {
  return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
}

// Look for patterns of shift followed by AND that can be turned into a pair of
// shifts. We won't need to materialize an immediate for the AND so these can
// be considered free.
static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
  uint64_t Mask = Imm.getZExtValue();
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
  if (!BO || !BO->hasOneUse())
    return false;

  if (BO->getOpcode() != Instruction::Shl)
    return false;

  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
  // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
  // is a mask shifted by c2 bits with c3 leading zeros.
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
    if (ShAmt == Trailing)
      return true;
  }

  return false;
}

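// Estimate the cost of materializing the immediate Imm when it appears as
// operand Idx of an instruction with the given Opcode. Immediates that the
// instruction can fold directly (12-bit immediates, Zba/Zbb/Zbs patterns,
// shift pairs) are reported as free so ConstantHoisting leaves them alone.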
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind,
                                                Instruction *Inst) {
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");

  // We have a Zero register, so 0 is always free.
  if (Imm == 0)
    return TTI::TCC_Free;

  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative, in others the immediate comes from a specific argument index.
  bool Takes12BitImm = false;
  unsigned ImmArgIdx = ~0U;

  switch (Opcode) {
  case Instruction::GetElementPtr:
    // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
    // split up large offsets in GEP into better parts than ConstantHoisting
    // can.
    return TTI::TCC_Free;
  case Instruction::Store: {
    // Use the materialization cost regardless of if it's the address or the
    // value that is constant, except for if the store is misaligned and
    // misaligned accesses are not legal (experience shows constant hoisting
    // can sometimes be harmful in such cases).
    if (Idx == 1 || !Inst)
      return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
                               /*FreeZeroes=*/true);

    StoreInst *ST = cast<StoreInst>(Inst);
    if (!getTLI()->allowsMemoryAccessForAlignment(
            Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
            ST->getPointerAddressSpace(), ST->getAlign()))
      return TTI::TCC_Free;

    return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
                             /*FreeZeroes=*/true);
  }
  case Instruction::Load:
    // If the address is a constant, use the materialization cost.
    return getIntImmCost(Imm, Ty, CostKind);
  case Instruction::And:
    if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
      return TTI::TCC_Free;
    if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
      return TTI::TCC_Free;
    if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
      return TTI::TCC_Free;
    if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
        canUseShiftPair(Inst, Imm))
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Add:
    Takes12BitImm = true;
    break;
  case Instruction::Or:
  case Instruction::Xor:
    if (ST->hasStdExtZbs() && Imm.isPowerOf2())
      return TTI::TCC_Free;
    Takes12BitImm = true;
    break;
  case Instruction::Mul:
    // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
    if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
      return TTI::TCC_Free;
    // One more or less than a power of 2 can use SLLI+ADD/SUB.
    if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
      return TTI::TCC_Free;
    // FIXME: There is no MULI instruction.
    Takes12BitImm = true;
    break;
  case Instruction::Sub:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    Takes12BitImm = true;
    ImmArgIdx = 1;
    break;
  default:
    break;
  }

  if (Takes12BitImm) {
    // Check immediate is the correct argument...
    if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
      // ... and fits into the 12-bit immediate.
      if (Imm.getSignificantBits() <= 64 &&
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
        return TTI::TCC_Free;
      }
    }

    // Otherwise, use the full materialisation cost.
    return getIntImmCost(Imm, Ty, CostKind);
  }

  // By default, prevent hoisting.
  return TTI::TCC_Free;
}

InstructionCost
RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                  const APInt &Imm, Type *Ty,
                                  TTI::TargetCostKind CostKind) {
  // Prevent hoisting in unknown cases.
  return TTI::TCC_Free;
}

bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
  return ST->hasVInstructions();
}

TargetTransformInfo::PopcntSupportKind
RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
             ? TTI::PSK_FastHardware
             : TTI::PSK_Software;
}

bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Currently, the ExpandReductions pass can't expand scalable-vector
  // reductions, but we still request expansion as RVV doesn't support certain
  // reductions and the SelectionDAG can't legalize them either.
  switch (II->getIntrinsicID()) {
  default:
    return false;
  // These reductions have no equivalent in RVV
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_fmul:
    return true;
  }
}

std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
  if (ST->hasVInstructions())
    return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
  return BaseT::getMaxVScale();
}

std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
  if (ST->hasVInstructions())
    if (unsigned MinVLen = ST->getRealMinVLen();
        MinVLen >= RISCV::RVVBitsPerBlock)
      return MinVLen / RISCV::RVVBitsPerBlock;
  return BaseT::getVScaleForTuning();
}

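// Report register widths for the scalar, fixed-vector, and scalable-vector
// register classes. Vector widths are scaled by the LMUL selected via
// -riscv-v-register-bit-width-lmul.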
TypeSize
RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost
RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
  // Add a cost of address generation + the cost of the load. The address
  // is expected to be a PC relative offset to a constant pool entry.
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
}

static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
  unsigned Size = Mask.size();
  if (!isPowerOf2_32(Size))
    return false;
  for (unsigned I = 0; I != Size; ++I) {
    if (static_cast<unsigned>(Mask[I]) == I)
      continue;
    if (Mask[I] != 0)
      return false;
    if (Size % I != 0)
      return false;
    for (unsigned J = I + 1; J != Size; ++J)
      // Check the pattern is repeated.
      if (static_cast<unsigned>(Mask[J]) != J % I)
        return false;
    SubVectorSize = I;
    return true;
  }
  // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
  return false;
}

static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
                                        LLVMContext &C) {
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
}

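// Shuffle costs are modeled in terms of the RVV instruction sequences the
// lowering is expected to produce (vrgather, vslide, vmerge, ...), plus the
// cost of any constant-pool loads needed for index or mask vectors.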
InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                             VectorType *Tp, ArrayRef<int> Mask,
                                             TTI::TargetCostKind CostKind,
                                             int Index, VectorType *SubTp,
                                             ArrayRef<const Value *> Args,
                                             const Instruction *CxtI) {
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // First, handle cases where having a fixed length vector enables us to
  // give a more accurate cost than falling back to generic scalable codegen.
  // TODO: Each of these cases hints at a modeling gap around scalable vectors.
  if (isa<FixedVectorType>(Tp)) {
    switch (Kind) {
    default:
      break;
    case TTI::SK_PermuteSingleSrc: {
      if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
        MVT EltTp = LT.second.getVectorElementType();
        // If the size of the element is < ELEN then shuffles of interleaves and
        // deinterleaves of 2 vectors can be lowered into the following
        // instructions.
        if (EltTp.getScalarSizeInBits() < ST->getELen()) {
          // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
          // vwaddu.vv v10, v8, v9
          // li a0, -1 (ignored)
          // vwmaccu.vx v10, a0, v9
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);

          if (Mask[0] == 0 || Mask[0] == 1) {
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
            // vnsrl.wi v10, v8, 0
            if (equal(DeinterleaveMask, Mask))
              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
                                                        LT.second, CostKind);
          }
        }
        int SubVectorSize;
        if (LT.second.getScalarSizeInBits() != 1 &&
            isRepeatedConcatMask(Mask, SubVectorSize)) {
          InstructionCost Cost = 0;
          unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
          // The cost of extraction from a subvector is 0 if the index is 0.
          for (unsigned I = 0; I != NumSlides; ++I) {
            unsigned InsertIndex = SubVectorSize * (1 << I);
            FixedVectorType *SubTp =
                FixedVectorType::get(Tp->getElementType(), InsertIndex);
            FixedVectorType *DestTp =
                FixedVectorType::getDoubleElementsVectorType(SubTp);
            std::pair<InstructionCost, MVT> DestLT =
                getTypeLegalizationCost(DestTp);
            // Add the cost of whole vector register move because the
            // destination vector register group for vslideup cannot overlap the
            // source.
            Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
            Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
                                   CostKind, InsertIndex, SubTp);
          }
          return Cost;
        }
      }
      // vrgather + cost of generating the mask constant.
      // We model this for an unknown mask with a single vrgather.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        return IndexCost +
               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
      }
      break;
    }
    case TTI::SK_Transpose:
    case TTI::SK_PermuteTwoSrc: {
      // 2 x (vrgather + cost of generating the mask constant) + cost of mask
      // register for the second vrgather. We model this for an unknown
      // (shuffle) mask.
      if (LT.second.isFixedLengthVector() && LT.first == 1 &&
          (LT.second.getScalarSizeInBits() != 8 ||
           LT.second.getVectorNumElements() <= 256)) {
        auto &C = Tp->getContext();
        auto EC = Tp->getElementCount();
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
        return 2 * IndexCost +
               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
                                       LT.second, CostKind) +
               MaskCost;
      }
      break;
    }
    case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. Providing an accurate cost only for splits where
      // the element type remains the same.
      if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
          LT.second.isFixedLengthVector() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              Tp->getElementType()->getPrimitiveSizeInBits() &&
          LT.second.getVectorNumElements() <
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);

        InstructionCost Cost = 0;
        for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF);
             I < NumSrcRegs; ++I) {
          bool IsSingleVector = true;
          SmallVector<int> SubMask(SubVF, PoisonMaskElem);
          transform(
              Mask.slice(I * SubVF,
                         I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF),
              SubMask.begin(), [&](int I) -> int {
                if (I == PoisonMaskElem)
                  return PoisonMaskElem;
                bool SingleSubVector = I / VF == 0;
                IsSingleVector &= SingleSubVector;
                return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF;
              });
          if (all_of(enumerate(SubMask), [](auto &&P) {
                return P.value() == PoisonMaskElem ||
                       static_cast<unsigned>(P.value()) == P.index();
              }))
            continue;
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
      }
      break;
    }
    }
  }

  // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
  switch (Kind) {
  default:
    // Fallthrough to generic handling.
    // TODO: Most of these cases will return getInvalid in generic code, and
    // must be implemented here.
    break;
  case TTI::SK_ExtractSubvector:
    // Extract at zero is always a subregister extract
    if (Index == 0)
      return TTI::TCC_Free;

    // If we're extracting a subvector of at most m1 size at a sub-register
    // boundary - which unfortunately we need exact vlen to identify - this is
    // a subregister extract at worst and thus won't require a vslidedown.
    // TODO: Extend for aligned m2, m4 subvector extracts
    // TODO: Extend for misaligned (but contained) extracts
    // TODO: Extend for scalable subvector types
    if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
        SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
      const unsigned MinVLen = ST->getRealMinVLen();
      const unsigned MaxVLen = ST->getRealMaxVLen();
      if (MinVLen == MaxVLen &&
          SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
          SubLT.second.getSizeInBits() <= MinVLen)
        return TTI::TCC_Free;
    }

    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslidedown.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
  case TTI::SK_InsertSubvector:
    // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
    // vslideup.vi v8, v9, 2
    return LT.first *
           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
  case TTI::SK_Select: {
    // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
    // vmerge.vvm v8, v9, v8, v0
    // We use 2 for the cost of the mask materialization as this is the true
    // cost for small masks and most shuffles are small. At worst, this cost
    // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
    return LT.first *
           (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
                                        LT.second, CostKind));
  }
  case TTI::SK_Broadcast: {
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
    if (LT.second.getScalarSizeInBits() == 1) {
      if (HasScalar) {
        // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
        // vmsne.vi v0, v8, 0
        return LT.first *
               (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                            LT.second, CostKind));
      }
      // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
      // vmerge.vim v8, v8, 1, v0
      // vmsne.vi v0, v8, 0
      return LT.first *
             (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
                                           RISCV::VMV_X_S, RISCV::VMV_V_X,
                                           RISCV::VMSNE_VI},
                                          LT.second, CostKind));
    }

    if (HasScalar)
      return LT.first *
             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);

    // vrgather.vi v9, v8, 0
    return LT.first *
           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
  }
  case TTI::SK_Splice: {
    // vslidedown+vslideup.
    // TODO: Multiplying by LT.first implies this legalizes into multiple copies
    // of similar code, but I think we expand through memory.
    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
    if (Index >= 0 && Index < 32)
      Opcodes[0] = RISCV::VSLIDEDOWN_VI;
    else if (Index < 0 && Index > -32)
      Opcodes[1] = RISCV::VSLIDEUP_VI;
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }
  case TTI::SK_Reverse: {
    // TODO: Cases to improve here:
    // * Illegal vector types
    // At low LMUL, most of the cost is producing the vrgather index register.
    // At high LMUL, the cost of the vrgather itself will dominate.
    // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
    // vrsub.vx v10, v9, a0
    // vrgather.vv v9, v8, v10
    InstructionCost LenCost = 3;
    if (LT.second.isFixedLengthVector())
      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
    if (LT.second.isFixedLengthVector() &&
        isInt<5>(LT.second.getVectorNumElements() - 1))
      Opcodes[1] = RISCV::VRSUB_VI;
    InstructionCost GatherCost =
        getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    // Mask operation additionally required extend and truncate
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
    return LT.first * (LenCost + GatherCost + ExtendCost);
  }
  }
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}

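// Return true if VT is assigned a register group of LMUL 1 or a fractional
// LMUL.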
static unsigned isM1OrSmaller(MVT VT) {
  RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
  return (LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 ||
          LMUL == RISCVII::VLMUL::LMUL_F2 || LMUL == RISCVII::VLMUL::LMUL_1);
}

InstructionCost RISCVTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
  if (isa<ScalableVectorType>(Ty))
    return InstructionCost::getInvalid();

  // A build_vector (which is m1 sized or smaller) can be done in no
  // worse than one vslide1down.vx per element in the type.  We could
  // in theory do an explode_vector in the inverse manner, but our
  // lowering today does not have a first class node for this pattern.
  InstructionCost Cost = BaseT::getScalarizationOverhead(
      Ty, DemandedElts, Insert, Extract, CostKind);
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
    if (Ty->getScalarSizeInBits() == 1) {
      auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
      // Note: Implicit scalar anyextend is assumed to be free since the i1
      // must be stored in a GPR.
      return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
                                      CostKind) +
             getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                              TTI::CastContextHint::None, CostKind, nullptr);
    }

    assert(LT.second.isFixedLengthVector());
    MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
    if (isM1OrSmaller(ContainerVT)) {
      InstructionCost BV =
          cast<FixedVectorType>(Ty)->getNumElements() *
          getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
      if (BV < Cost)
        Cost = BV;
    }
  }
  return Cost;
}

InstructionCost
RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                    unsigned AddressSpace,
                                    TTI::TargetCostKind CostKind) {
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);

  return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

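// Interleaved accesses that can be lowered to segment loads/stores are costed
// either as one wide memory op plus Factor * LMUL shuffles (when the target
// has optimized segment accesses) or per element; other fixed-length cases
// fall back to an explicit load/store plus shuffle model.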
InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {

  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load and store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics.
  if (!UseMaskForCond && !UseMaskForGaps &&
      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    auto *VTy = cast<VectorType>(VecTy);
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
    // Need to make sure type hasn't been scalarized
    if (LT.second.isVector()) {
      auto *SubVecTy =
          VectorType::get(VTy->getElementType(),
                          VTy->getElementCount().divideCoefficientBy(Factor));
      if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
          TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        // Some processors optimize segment loads/stores as one wide memory op +
        // Factor * LMUL shuffle ops.
        if (ST->hasOptimizedSegmentLoadStore(Factor)) {
          InstructionCost Cost =
              getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
          MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
          Cost += Factor * TLI->getLMULCost(SubVecVT);
          return LT.first * Cost;
        }

        // Otherwise, the cost is proportional to the number of elements (VL *
        // Factor ops).
        InstructionCost MemOpCost =
            getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
                            CostKind, {TTI::OK_AnyValue, TTI::OP_None});
        unsigned NumLoads = getEstimatedVLFor(VTy);
        return NumLoads * MemOpCost;
      }
    }
  }

  // TODO: Return the cost of interleaved accesses for scalable vector when
  // unable to convert to segment accesses instructions.
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();

  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
  unsigned VF = FVTy->getNumElements() / Factor;

  // An interleaved load will look like this for Factor=3:
  // %wide.vec = load <12 x i32>, ptr %3, align 4
  // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
  if (Opcode == Instruction::Load) {
    InstructionCost Cost = MemCost;
    for (unsigned Index : Indices) {
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
      Cost += ShuffleCost;
    }
    return Cost;
  }

  // TODO: Model for NF > 2
  // We'll need to enhance getShuffleCost to model shuffles that are just
  // inserts and extracts into subvectors, since they won't have the full cost
  // of the vrgather.
  // An interleaved store for 3 vectors of 4 lanes will look like
  // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
  // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
  // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
  // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
  // store <12 x i32> %interleaved.vec, ptr %10, align 4
  if (Factor != 2)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
  return MemCost + ShuffleCost;
}

InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied.  For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
  if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
       !isLegalStridedLoadStore(DataTy, Alignment)) ||
      (Opcode != Instruction::Load && Opcode != Instruction::Store))
    return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  if (CostKind == TTI::TCK_CodeSize)
    return TTI::TCC_Basic;

  // Cost is proportional to the number of memory operations implied.  For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
  return NumLoads * MemOpCost;
}

InstructionCost
RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
  // FIXME: This is a property of the default vector convention, not
  // all possible calling conventions.  Fixing that will require
  // some TTI API and SLP rework.
  InstructionCost Cost = 0;
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  for (auto *Ty : Tys) {
    if (!Ty->isVectorTy())
      continue;
    Align A = DL.getPrefTypeAlign(Ty);
    Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
            getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
  }
  return Cost;
}

// Currently, these represent both throughput and codesize costs
// for the respective intrinsics.  The costs in this table are simply
// instruction counts with the following adjustments made:
// * One vsetvli is considered free.
static const CostTblEntry VectorIntrinsicCostTable[]{
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::ctlz, MVT::i8, 19},
    {Intrinsic::ctlz, MVT::i16, 28},
    {Intrinsic::ctlz, MVT::i32, 31},
    {Intrinsic::ctlz, MVT::i64, 35},
    {Intrinsic::cttz, MVT::i8, 16},
    {Intrinsic::cttz, MVT::i16, 23},
    {Intrinsic::cttz, MVT::i32, 24},
    {Intrinsic::cttz, MVT::i64, 25},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
};

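// Map a VP intrinsic ID to the corresponding VP SelectionDAG opcode using
// VPIntrinsics.def, or ISD::DELETED_NODE if there is no mapping.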
static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
  switch (ID) {
#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD)                                    \
  case Intrinsic::VPID:                                                        \
    return ISD::VPSD;
#include "llvm/IR/VPIntrinsics.def"
#undef HELPER_MAP_VPID_TO_VPSD
  }
  return ISD::DELETED_NODE;
}

InstructionCost
RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                    TTI::TargetCostKind CostKind) {
  auto *RetTy = ICA.getReturnType();
  switch (ICA.getID()) {
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    // We can't currently lower half or bfloat vector lrint/llrint.
    if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
        VecTy && VecTy->getElementType()->is16bitFPTy())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  case Intrinsic::ceil:
  case Intrinsic::floor:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::round:
  case Intrinsic::roundeven: {
    // These all use the same code.
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
      return LT.first * 8;
    break;
  }
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
      return LT.first;

    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::umin:
        Op = RISCV::VMINU_VV;
        break;
      case Intrinsic::umax:
        Op = RISCV::VMAXU_VV;
        break;
      case Intrinsic::smin:
        Op = RISCV::VMIN_VV;
        break;
      case Intrinsic::smax:
        Op = RISCV::VMAX_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::sadd_sat:
        Op = RISCV::VSADD_VV;
        break;
      case Intrinsic::ssub_sat:
        Op = RISCV::VSSUBU_VV;
        break;
      case Intrinsic::uadd_sat:
        Op = RISCV::VSADDU_VV;
        break;
      case Intrinsic::usub_sat:
        Op = RISCV::VSSUBU_VV;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::fabs:
  case Intrinsic::sqrt: {
    auto LT = getTypeLegalizationCost(RetTy);
    // TODO: add f16/bf16, bf16 with zvfbfmin && f16 with zvfhmin
    if (ST->hasVInstructions() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::fabs:
        Op = RISCV::VFSGNJX_VV;
        break;
      case Intrinsic::sqrt:
        Op = RISCV::VFSQRT_V;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::cttz:
  case Intrinsic::ctlz:
  case Intrinsic::ctpop: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
      unsigned Op;
      switch (ICA.getID()) {
      case Intrinsic::cttz:
        Op = RISCV::VCTZ_V;
        break;
      case Intrinsic::ctlz:
        Op = RISCV::VCLZ_V;
        break;
      case Intrinsic::ctpop:
        Op = RISCV::VCPOP_V;
        break;
      }
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::abs: {
    auto LT = getTypeLegalizationCost(RetTy);
    if (ST->hasVInstructions() && LT.second.isVector()) {
      // vrsub.vi v10, v8, 0
      // vmax.vv v8, v8, v10
      return LT.first *
             getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
                                     LT.second, CostKind);
    }
    break;
  }
  case Intrinsic::get_active_lane_mask: {
    if (ST->hasVInstructions()) {
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      auto LT = getTypeLegalizationCost(ExpRetTy);

      // vid.v v8 // considered hoisted
      // vsaddu.vx v8, v8, a0
      // vmsltu.vx v0, v8, a1
      return LT.first *
             getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
                                     LT.second, CostKind);
    }
    break;
  }
  // TODO: add more intrinsic
  case Intrinsic::stepvector: {
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal types involves an `index' instruction plus
    // (LT.first - 1) vector adds.
    if (ST->hasVInstructions())
      return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
             (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
    return 1 + (LT.first - 1);
  }
  case Intrinsic::experimental_cttz_elts: {
    Type *ArgTy = ICA.getArgTypes()[0];
    EVT ArgType = TLI->getValueType(DL, ArgTy, true);
    if (getTLI()->shouldExpandCttzElements(ArgType))
      break;
    InstructionCost Cost = getRISCVInstructionCost(
        RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);

    // If zero_is_poison is false, then we will generate additional
    // cmp + select instructions to convert -1 to EVL.
    Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
    if (ICA.getArgs().size() > 1 &&
        cast<ConstantInt>(ICA.getArgs()[1])->isZero())
      Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
                                 CmpInst::ICMP_SLT, CostKind) +
              getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
                                 CmpInst::BAD_ICMP_PREDICATE, CostKind);
    return Cost;
  }
  case Intrinsic::vp_rint: {
    // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
    InstructionCost Cost = 5;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_nearbyint: {
    // One more read and one more write of fflags than vp_rint.
    InstructionCost Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_ceil:
  case Intrinsic::vp_floor:
  case Intrinsic::vp_round:
  case Intrinsic::vp_roundeven:
  case Intrinsic::vp_roundtozero: {
    // Rounding with static rounding mode needs two more instructions to
    // swap/write FRM than vp_rint.
    InstructionCost Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
      return Cost * LT.first;
    break;
  }
  case Intrinsic::vp_fneg: {
    std::optional<unsigned> FOp =
        VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
    assert(FOp.has_value());
    return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
  }
  case Intrinsic::vp_select: {
    Intrinsic::ID IID = ICA.getID();
    std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
    assert(FOp.has_value());
    return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
                              CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }
  case Intrinsic::vp_merge:
    return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
                              ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
                              CostKind);
  case Intrinsic::experimental_vp_splat: {
    auto LT = getTypeLegalizationCost(RetTy);
    // TODO: Lower i1 experimental_vp_splat
    if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
      return InstructionCost::getInvalid();
    return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
                                                  ? RISCV::VFMV_V_F
                                                  : RISCV::VMV_V_X,
                                              LT.second, CostKind);
  }
  }

  if (ST->hasVInstructions() && RetTy->isVectorTy()) {
    if (auto LT = getTypeLegalizationCost(RetTy);
        LT.second.isVector()) {
      MVT EltTy = LT.second.getVectorElementType();
      if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
                                              ICA.getID(), EltTy))
        return LT.first * Entry->Cost;
    }
  }

  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                               Type *Src,
                                               TTI::CastContextHint CCH,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
  if (!IsVectorType)
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  // FIXME: Need to compute legalizing cost for illegal types.  The current
  // code handles only legal types and those which can be trivially
  // promoted to legal.
  if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
      Dst->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
  std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);

  // Handle i1 source and dest cases *before* calling logic in BasicTTI.
  // The shared implementation doesn't model vector widening during legalization
  // and instead assumes scalarization.  In order to scalarize an <N x i1>
  // vector, we need to extend/trunc to/from i8.  If we don't special case
  // this, we can get an infinite recursion cycle.
  switch (ISD) {
  default:
    break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
    if (Src->getScalarSizeInBits() == 1) {
      // We do not use vsext/vzext to extend from mask vector.
      // Instead we use the following instructions to extend from mask vector:
      // vmv.v.i v8, 0
      // vmerge.vim v8, v8, -1, v0 (repeated per split)
      return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
             DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
                                                   DstLT.second, CostKind) +
             DstLT.first - 1;
    }
    break;
  case ISD::TRUNCATE:
    if (Dst->getScalarSizeInBits() == 1) {
      // We do not use several vncvt to truncate to mask vector. So we could
      // not use PowDiff to calculate it.
      // Instead we use the following instructions to truncate to mask vector:
      // vand.vi v8, v8, 1
      // vmsne.vi v0, v8, 0
      return SrcLT.first *
                 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
                                         SrcLT.second, CostKind) +
             SrcLT.first - 1;
    }
    break;
  }

  // Our actual lowering for the case where a wider legal type is available
  // uses promotion to the wider type.  This is reflected in the result of
  // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
  // scalarized if the legalized Src and Dst are not equal sized.
  const DataLayout &DL = this->getDataLayout();
  if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
      !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
                           SrcLT.second.getSizeInBits()) ||
      !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
                           DstLT.second.getSizeInBits()))
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  // The split cost is handled by the base getCastInstrCost
  assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");

  int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
                (int)Log2_32(SrcLT.second.getScalarSizeInBits());
  switch (ISD) {
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    if ((PowDiff < 1) || (PowDiff > 3))
      return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
    unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
    unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
    unsigned Op =
        (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
    return getRISCVInstructionCost(Op, DstLT.second, CostKind);
  }
  case ISD::TRUNCATE:
  case ISD::FP_EXTEND:
  case ISD::FP_ROUND: {
    // Counts of narrow/widen instructions.
    unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
    unsigned DstEltSize = DstLT.second.getScalarSizeInBits();

    unsigned Op = (ISD == ISD::TRUNCATE)    ? RISCV::VNSRL_WI
                  : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
                                            : RISCV::VFNCVT_F_F_W;
    InstructionCost Cost = 0;
    for (; SrcEltSize != DstEltSize;) {
      MVT ElementMVT = (ISD == ISD::TRUNCATE)
                           ? MVT::getIntegerVT(DstEltSize)
                           : MVT::getFloatingPointVT(DstEltSize);
      MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
      DstEltSize =
          (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
      Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
    }
    return Cost;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    unsigned IsSigned = ISD == ISD::FP_TO_SINT;
    unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
    unsigned FWCVT =
        IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
    unsigned FNCVT =
        IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
    unsigned SrcEltSize = Src->getScalarSizeInBits();
    unsigned DstEltSize = Dst->getScalarSizeInBits();
    InstructionCost Cost = 0;
    if ((SrcEltSize == 16) &&
        (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
      // If the target only supports zvfhmin or it is fp16-to-i64 conversion
      // pre-widening to f32 and then convert f32 to integer
      VectorType *VecF32Ty =
          VectorType::get(Type::getFloatTy(Dst->getContext()),
                          cast<VectorType>(Dst)->getElementCount());
      std::pair<InstructionCost, MVT> VecF32LT =
          getTypeLegalizationCost(VecF32Ty);
      Cost +=
          VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
                                                   VecF32LT.second, CostKind);
      Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
      return Cost;
    }
    if (DstEltSize == SrcEltSize)
      Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
    else if (DstEltSize > SrcEltSize)
      Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
    else { // (SrcEltSize > DstEltSize)
      // First do a narrowing conversion to an integer half the size, then
      // truncate if needed.
      MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
      MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
      Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
      if ((SrcEltSize / 2) > DstEltSize) {
        Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
        Cost +=
            getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
      }
    }
    return Cost;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP: {
    unsigned IsSigned = ISD == ISD::SINT_TO_FP;
    unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
    unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
    unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
    unsigned SrcEltSize = Src->getScalarSizeInBits();
    unsigned DstEltSize = Dst->getScalarSizeInBits();

    InstructionCost Cost = 0;
    if ((DstEltSize == 16) &&
        (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
      // If the target only supports zvfhmin or it is i64-to-fp16 conversion
      // it is converted to f32 and then converted to f16
      VectorType *VecF32Ty =
          VectorType::get(Type::getFloatTy(Dst->getContext()),
                          cast<VectorType>(Dst)->getElementCount());
      std::pair<InstructionCost, MVT> VecF32LT =
          getTypeLegalizationCost(VecF32Ty);
      Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
      Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
                                                       DstLT.second, CostKind);
      return Cost;
    }

    if (DstEltSize == SrcEltSize)
      Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
    else if (DstEltSize > SrcEltSize) {
      if ((DstEltSize / 2) > SrcEltSize) {
        VectorType *VecTy =
            VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
                            cast<VectorType>(Dst)->getElementCount());
        unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
        Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
      }
      Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
    } else
      Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
    return Cost;
  }
  }
  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

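// Estimate the number of active elements (VL) for a vector type. For scalable
// types this uses the vscale-for-tuning estimate rather than the unknown
// runtime VLEN.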
unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
}

InstructionCost
RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                     FastMathFlags FMF,
                                     TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  if (Ty->getElementType()->isIntegerTy(1)) {
    // SelectionDAGBuilder does following transforms:
    //   vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
    //   vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
    if (IID == Intrinsic::umax || IID == Intrinsic::smin)
      return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
    else
      return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
  }

  if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
    SmallVector<unsigned, 3> Opcodes;
    InstructionCost ExtraCost = 0;
    switch (IID) {
    case Intrinsic::maximum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
                   RISCV::VFMV_F_S};
        // Cost of Canonical Nan + branch
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DstTy->getScalarSizeInBits();
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;

    case Intrinsic::minimum:
      if (FMF.noNaNs()) {
        Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
      } else {
        Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
                   RISCV::VFMV_F_S};
        // Cost of Canonical Nan + branch
        Type *DstTy = Ty->getScalarType();
        const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
      }
      break;
    }
    return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
  }

  // IR Reduction is composed by one rvv reduction instruction and vmv
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (IID) {
  default:
    llvm_unreachable("Unsupported intrinsic");
  case Intrinsic::smax:
    SplitOp = RISCV::VMAX_VV;
    Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::smin:
    SplitOp = RISCV::VMIN_VV;
    Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umax:
    SplitOp = RISCV::VMAXU_VV;
    Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::umin:
    SplitOp = RISCV::VMINU_VV;
    Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
    break;
  case Intrinsic::maxnum:
    SplitOp = RISCV::VFMAX_VV;
    Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
    break;
  case Intrinsic::minnum:
    SplitOp = RISCV::VFMIN_VV;
    Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

InstructionCost
RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                         std::optional<FastMathFlags> FMF,
                                         TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
      ISD != ISD::FADD)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  Type *ElementTy = Ty->getElementType();
  if (ElementTy->isIntegerTy(1)) {
    if (ISD == ISD::AND) {
      // Example sequences:
      //   vsetvli a0, zero, e8, mf8, ta, ma
      //   vmand.mm v8, v9, v8 ; needed every time type is split
      return LT.first * getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second,
                                                CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
    } else if (ISD == ISD::XOR) {
      // Example sequences:
      //   vsetvli a0, zero, e8, mf8, ta, ma
      //   vmxor.mm v8, v0, v8 ; needed every time type is split
      return (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
    } else {
      // Example sequences:
      //   vsetvli a0, zero, e8, mf8, ta, ma
      //   vmxor.mm v8, v9, v8 ; needed every time type is split
      return (LT.first - 1) *
                 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
             getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
    }
  }

  // IR Reduction of or/and is composed by one vmv and one rvv reduction
  // instruction, and others is composed by two vmv and one rvv reduction
  // instruction
  unsigned SplitOp;
  SmallVector<unsigned, 3> Opcodes;
  switch (ISD) {
  case ISD::ADD:
    SplitOp = RISCV::VADD_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
    break;
  case ISD::OR:
    SplitOp = RISCV::VOR_VV;
    Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::XOR:
    SplitOp = RISCV::VXOR_VV;
    Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
    break;
  case ISD::AND:
    SplitOp = RISCV::VAND_VV;
    Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
    break;
  case ISD::FADD:
    // We can't promote f16/bf16 fadd reductions.
    if ((LT.second.getVectorElementType() == MVT::f16 &&
         !ST->hasVInstructionsF16()) ||
        LT.second.getVectorElementType() == MVT::bf16)
      return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
    if (TTI::requiresOrderedReduction(FMF)) {
      Opcodes.push_back(RISCV::VFMV_S_F);
      for (unsigned i = 0; i < LT.first.getValue(); i++)
        Opcodes.push_back(RISCV::VFREDOSUM_VS);
      Opcodes.push_back(RISCV::VFMV_F_S);
      return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
    }
    SplitOp = RISCV::VFADD_VV;
    Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
    break;
  }
  // Add a cost for data larger than LMUL8
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
}

InstructionCost RISCVTTIImpl::getExtendedReductionCost(
    unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
    FastMathFlags FMF, TTI::TargetCostKind CostKind) {
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (IsUnsigned && Opcode == Instruction::Add &&
      LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
    // Represent vector_reduce_add(ZExt(<n x i1>)) as
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
    return LT.first *
           getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
  }

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}
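// Extra cost of materializing the constant operand of a store. Scalar
// immediates are currently treated as free (see the FIXME below); vector
// constants are either a single splat (vmv.v.i/vmv.v.x/vfmv.v.f) or a
// constant pool load.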
InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
                                              TTI::OperandValueInfo OpInfo,
                                              TTI::TargetCostKind CostKind) {
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
    // FIXME: We need to account for immediate materialization here, but doing
    // a decent job requires more knowledge about the immediate than we
    // currently have here.
    return 0;

  if (OpInfo.isUniform())
    // vmv.v.i, vmv.v.x, or vfmv.v.f
    // We ignore the cost of the scalar constant materialization to be
    // consistent with how we treat scalar constants themselves just above.
    return 1;

  return getConstantPoolLoadCost(Ty, CostKind);
}
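// Memory op cost. On top of the base cost, constant store operands pay a
// materialization cost, and legal vector types are scaled by LMUL, so that
// e.g. an LMUL=8 access is modeled as roughly eight times an LMUL=1 access.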
InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                              MaybeAlign Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              TTI::OperandValueInfo OpInfo,
                                              const Instruction *I) {
  EVT VT = TLI->getValueType(DL, Src, true);
  // Type legalization can't handle structs.
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  InstructionCost Cost = 0;
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getStoreImmCost(Src, OpInfo, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);

  InstructionCost BaseCost = [&]() {
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    // Our actual lowering for the case where a wider legal type is available
    // uses a VL predicated load on the wider type. This is reflected in
    // the result of getTypeLegalizationCost, but BasicTTI assumes the
    // widened cases are scalarized.
    const DataLayout &DL = this->getDataLayout();
    if (Src->isVectorTy() && LT.second.isVector() &&
        TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
                            LT.second.getSizeInBits()))
      return Cost;

    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);
  }();

  // Assume memory op costs scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}
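// Compare/select cost. Vector selects are modeled on vmerge.vvm (plus mask
// ops for i1 elements), integer compares on vmslt.vv as a representative, and
// FP compares on one or more vmf*.vv plus mask instructions depending on the
// predicate.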
InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info, I);

  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info, I);

  // Skip if scalar size of ValTy is bigger than ELEN.
  if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                     Op1Info, Op2Info, I);

  auto GetConstantMatCost =
      [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform())
      // We return 0 because we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(ValTy, CostKind);
  };

  InstructionCost ConstantMatCost;
  if (Op1Info.isConstant())
    ConstantMatCost += GetConstantMatCost(Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += GetConstantMatCost(Op2Info);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
    if (CondTy->isVectorTy()) {
      if (ValTy->getScalarSizeInBits() == 1) {
        // vmandn.mm v8, v8, v9
        // vmand.mm v9, v0, v9
        // vmor.mm v0, v9, v8
        return ConstantMatCost +
               LT.first *
                   getRISCVInstructionCost(
                       {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                       LT.second, CostKind);
      }
      // vselect and max/min are supported natively.
      return ConstantMatCost +
             LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
                                                CostKind);
    }

    if (ValTy->getScalarSizeInBits() == 1) {
      // vmv.v.x v9, a0
      // vmsne.vi v9, v9, 0
      // vmandn.mm v8, v8, v9
      // vmand.mm v9, v0, v9
      // vmor.mm v0, v9, v8
      MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
                                         InterimVT, CostKind) +
             LT.first * getRISCVInstructionCost(
                            {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
                            LT.second, CostKind);
    }

    // vmv.v.x v10, a0
    // vmsne.vi v0, v10, 0
    // vmerge.vvm v8, v9, v8, v0
    return ConstantMatCost +
           LT.first * getRISCVInstructionCost(
                          {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
                          LT.second, CostKind);
  }

  if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
      CmpInst::isIntPredicate(VecPred)) {
    // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
    // provided they incur the same cost across all implementations.
    return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
                                                                LT.second,
                                                                CostKind);
  }

  if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
      CmpInst::isFPPredicate(VecPred)) {

    // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask.
    if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
      return ConstantMatCost +
             getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);

    // If we do not support the input floating point vector type, use the base
    // one which will calculate as:
    // ScalarizeCost + Num * Cost for fixed vector,
    // InvalidCost for scalable vector.
    if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
        (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
        (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       Op1Info, Op2Info, I);

    // Assuming vector fp compare and mask instructions are all the same cost
    // until a need arises to differentiate them.
    switch (VecPred) {
    case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
    case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
    case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
    case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
      return ConstantMatCost +
             LT.first * getRISCVInstructionCost(
                            {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
                            LT.second, CostKind);

    case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
    case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
    case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
                                         LT.second, CostKind);

    case CmpInst::FCMP_OEQ: // vmfeq.vv
    case CmpInst::FCMP_OGT: // vmflt.vv
    case CmpInst::FCMP_OGE: // vmfle.vv
    case CmpInst::FCMP_OLT: // vmflt.vv
    case CmpInst::FCMP_OLE: // vmfle.vv
    case CmpInst::FCMP_UNE: // vmfne.vv
      return ConstantMatCost +
             LT.first *
                 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
    default:
      break;
    }
  }

  // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
  // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
  // generate a conditional branch + mv. The cost of scalar (icmp + select) will
  // be (0 + select instr cost).
  if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
      ValTy->isIntegerTy() && !I->user_empty()) {
    if (all_of(I->users(), [&](const User *U) {
          return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
                 U->getType()->isIntegerTy() &&
                 !isa<ConstantData>(U->getOperand(1)) &&
                 !isa<ConstantData>(U->getOperand(2));
        }))
      return 0;
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                   Op1Info, Op2Info, I);
}
InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}
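// Element insert/extract cost. The common case is a slide plus a scalar<->
// vector move, e.g. an extract at a non-constant index is roughly:
//   vslidedown.vx v8, v8, a0
//   vmv.x.s a0, v8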
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with non-constant index is very costly when
    // scalarized; estimate cost of loads/stores sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vector.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  // Mask vector extract/insert is expanded via e8.
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement, we need to add 1 to the index as the input of vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // If exact VLEN is known, we will insert/extract into the appropriate
    // subvector with no additional subvector insert/extract cost.
    if (auto VLEN = ST->getRealVLen()) {
      unsigned EltSize = LT.second.getScalarSizeInBits();
      unsigned M1Max = *VLEN / EltSize;
      Index = Index % M1Max;
    }

    // We could extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // When the vector needs to be split into multiple register groups and the
  // index exceeds a single vector register group, we need to insert/extract
  // the element via the stack.
  if (LT.first > 1 &&
      ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
                          LT.second.isScalableVector()))) {
    Type *ScalarType = Val->getScalarType();
    Align VecAlign = DL.getPrefTypeAlign(Val);
    Align SclAlign = DL.getPrefTypeAlign(ScalarType);
    // Extra addi for unknown index.
    InstructionCost IdxCost = Index == -1U ? 1 : 0;

    // Store all split vectors into stack and load the target element.
    if (Opcode == Instruction::ExtractElement)
      return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
             getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
                             CostKind) +
             IdxCost;

    // Store all split vectors into stack, store the target element, and load
    // the vectors back.
    return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
           getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
           getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
                           CostKind) +
           IdxCost;
  }

  // Extracting an i64 on a target with XLEN=32 needs more instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not count)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not count)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not count)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}
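// Arithmetic cost. Common binary ops are mapped onto a representative
// single-width RVV instruction (vadd.vv, vmul.vv, vfadd.vv, ...) and scaled
// by LMUL; f16/bf16 ops that get promoted also pay for the fpext/fptrunc
// pair around the widened op.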
InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // f16 with zvfhmin and bf16 will be promoted to f32.
  // FIXME: nxv32[b]f16 will be custom lowered and split.
  unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
  InstructionCost CastCost = 0;
  if ((LT.second.getVectorElementType() == MVT::f16 ||
       LT.second.getVectorElementType() == MVT::bf16) &&
      TLI->getOperationAction(ISDOpcode, LT.second) ==
          TargetLoweringBase::LegalizeAction::Promote) {
    MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
    Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
    Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
    // Add cost of extending arguments.
    CastCost += LT.first * Args.size() *
                getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
                                 TTI::CastContextHint::None, CostKind);
    // Add cost of truncating result.
    CastCost +=
        LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
                                    TTI::CastContextHint::None, CostKind);
    // Compute cost of op in promoted type.
    LT.second = PromotedVT;
  }

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5 bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);

  unsigned Op;
  switch (ISDOpcode) {
  case ISD::ADD:
  case ISD::SUB:
    Op = RISCV::VADD_VV;
    break;
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    Op = RISCV::VSLL_VV;
    break;
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
    break;
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
    Op = RISCV::VMUL_VV;
    break;
  case ISD::SDIV:
  case ISD::UDIV:
    Op = RISCV::VDIV_VV;
    break;
  case ISD::SREM:
  case ISD::UREM:
    Op = RISCV::VREM_VV;
    break;
  case ISD::FADD:
  case ISD::FSUB:
    Op = RISCV::VFADD_VV;
    break;
  case ISD::FMUL:
    Op = RISCV::VFMUL_VV;
    break;
  case ISD::FDIV:
    Op = RISCV::VFDIV_VV;
    break;
  case ISD::FNEG:
    Op = RISCV::VFSGNJN_VV;
    break;
  default:
    // Assuming all other instructions have the same cost until a need arises
    // to differentiate them.
    return CastCost + ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }

  InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
  // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
  // ops are twice as expensive as integer ops. Do the same for vectors so
  // scalar floating point ops aren't cheaper than their vector equivalents.
  if (Ty->isFPOrFPVectorTy())
    InstrCost *= 2;
  return CastCost + ConstantMatCost + LT.first * InstrCost;
}
// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we take into account GEP instructions only
  // (although here can come alloca instruction, a value, constants and/or
  // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
  // pointer). Typically, if Base is not a GEP instruction and all the
  // pointers are relative to the same base address, all the rest are
  // either GEP instructions, PHIs, bitcasts or constants. When we have the
  // same base, we just calculate the cost of each non-Base GEP as an ADD
  // operation if any of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None}, {});
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}
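// Unrolling preferences: when the subtarget opts out of the default policy,
// only small, call-free, non-vectorized loops with simple control flow are
// considered for runtime/partial unrolling.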
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  //       would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent upon the
  // conditions below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow one exit other than the latch. This acts as an early exit
  // check, as it mirrors the profitability calculation of the runtime
  // unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Force-unrolling small loops can be very useful because of the branch
  // taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}
void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
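// Register usage: scalable vectors take one register per RVVBitsPerBlock
// chunk (i.e. per LMUL=1 group); fixed-length vectors are sized against the
// minimum VLEN.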
unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  if (Ty->isVectorTy()) {
    // f16 with only zvfhmin and bf16 will be promoted to f32.
    Type *EltTy = cast<VectorType>(Ty)->getElementType();
    if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
        EltTy->isBFloatTy())
      Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
                           cast<VectorType>(Ty)->getElementCount());

    TypeSize Size = DL.getTypeSizeInBits(Ty);
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}
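// Maximum VF for the SLP vectorizer: how many elements of width ElemWidth fit
// in a fixed-width vector register, unless overridden by the SLPMaxVF option.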
unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitwidth. This is the
  // same routine as used in LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}
TTI::AddressingModeKind
RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
                                         ScalarEvolution *SE) const {
  if (ST->hasVendorXCVmem() && !ST->is64Bit())
    return TTI::AMK_PostIndexed;

  return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
}
bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give the instruction count first
  // priority. If we need to emit adds inside the loop to add up base
  // registers, then we need at least one extra temporary register.
  unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
  return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
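// Expanding loads are only supported for fixed-length vectors that are legal
// for masked load/store; see the FIXME below for the additional restriction
// on large i8 vectors.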
bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;

  // FIXME: If it is an i8 vector and the element count exceeds 256, we should
  // scalarize these types with LMUL >= maximum fixed-length LMUL.
  if (VTy->getElementType()->isIntegerTy(8))
    if (VTy->getElementCount().getFixedValue() > 256)
      return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
             ST->getMaxLMULForFixedLengthVectors();
  return true;
}
bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
  auto *VTy = dyn_cast<VectorType>(DataTy);
  if (!VTy || VTy->isScalableTy())
    return false;

  if (!isLegalMaskedLoadStore(DataTy, Alignment))
    return false;
  return true;
}
/// See if \p I should be considered for address type promotion. We check if
/// \p I is a sext with the right type and used in memory accesses. If it is
/// used in a "complex" getelementptr, we allow it to be promoted without
/// finding other sext instructions that sign extended the same initial value.
/// A getelementptr is considered "complex" if it has more than 2 operands.
bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;
  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;
  // See if the sext is the one with the right type and used in at least one
  // GetElementPtrInst.
  for (const User *U : I.users()) {
    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
      Considerable = true;
      // A getelementptr is considered as "complex" if it has more than 2
      // operands. We will promote a SExt used in such complex GEP as we
      // expect some computation to be merged if they are done on 64 bits.
      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;
        break;
      }
    }
  }
  return Considerable;
}
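// Return true if operand \p Operand of an instruction with opcode \p Opcode
// can be kept scalar and splatted by the instruction itself (roughly, whether
// the op has a .vx/.vi/.vf form for that operand).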
bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::ICmp:
  case Instruction::FCmp:
    return true;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::Select:
    return Operand == 1;
  default:
    return false;
  }
}
bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
  if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
    return false;

  if (canSplatOperand(I->getOpcode(), Operand))
    return true;

  auto *II = dyn_cast<IntrinsicInst>(I);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::fma:
  case Intrinsic::vp_fma:
  case Intrinsic::fmuladd:
  case Intrinsic::vp_fmuladd:
    return Operand == 0 || Operand == 1;
  case Intrinsic::vp_shl:
  case Intrinsic::vp_lshr:
  case Intrinsic::vp_ashr:
  case Intrinsic::vp_udiv:
  case Intrinsic::vp_sdiv:
  case Intrinsic::vp_urem:
  case Intrinsic::vp_srem:
  case Intrinsic::ssub_sat:
  case Intrinsic::vp_ssub_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::vp_usub_sat:
  case Intrinsic::vp_select:
    return Operand == 1;
  // These intrinsics are commutative.
  case Intrinsic::vp_add:
  case Intrinsic::vp_mul:
  case Intrinsic::vp_and:
  case Intrinsic::vp_or:
  case Intrinsic::vp_xor:
  case Intrinsic::vp_fadd:
  case Intrinsic::vp_fmul:
  case Intrinsic::vp_icmp:
  case Intrinsic::vp_fcmp:
  case Intrinsic::smin:
  case Intrinsic::vp_smin:
  case Intrinsic::umin:
  case Intrinsic::vp_umin:
  case Intrinsic::smax:
  case Intrinsic::vp_smax:
  case Intrinsic::umax:
  case Intrinsic::vp_umax:
  case Intrinsic::sadd_sat:
  case Intrinsic::vp_sadd_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::vp_uadd_sat:
  // These intrinsics have 'vr' versions.
  case Intrinsic::vp_sub:
  case Intrinsic::vp_fsub:
  case Intrinsic::vp_fdiv:
    return Operand == 0 || Operand == 1;
  default:
    return false;
  }
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// splats of scalars can fold into vector instructions.
bool RISCVTTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
    return false;

  // Don't sink splat operands if the target prefers it. Some targets require
  // S2V transfer buffers and we can run out of them copying the same value
  // repeatedly.
  // FIXME: It could still be worth doing if it would improve vector register
  // pressure and prevent a vector spill.
  if (!ST->sinkSplatOperands())
    return false;

  for (auto OpIdx : enumerate(I->operands())) {
    if (!canSplatOperand(I, OpIdx.index()))
      continue;

    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
    // Make sure we are not already sinking this operand.
    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
      continue;

    // We are looking for a splat that can be sunk.
    if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
                             m_Undef(), m_ZeroMask())))
      continue;

    // Don't sink i1 splats.
    if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
      continue;

    // All uses of the shuffle should be sunk to avoid duplicating it across gpr
    // and vector registers.
    for (Use &U : Op->uses()) {
      Instruction *Insn = cast<Instruction>(U.getUser());
      if (!canSplatOperand(Insn, U.getOperandNo()))
        return false;
    }

    Ops.push_back(&Op->getOperandUse(0));
    Ops.push_back(&OpIdx.value());
  }
  return true;
}
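// Memcmp expansion requires fast unaligned scalar access; non-zero compares
// additionally rely on Zbb/Zbkb (e.g. rev8) so loaded words can be compared
// lexicographically.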
RISCVTTIImpl::TTI::MemCmpExpansionOptions
RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  // TODO: Enable expansion when unaligned access is not supported after we fix
  // issues in ExpandMemcmp.
  if (!ST->enableUnalignedScalarMem())
    return Options;

  if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
    return Options;

  Options.AllowOverlappingLoads = true;
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;
  if (ST->is64Bit()) {
    Options.LoadSizes = {8, 4, 2, 1};
    Options.AllowedTailExpansions = {3, 5, 6};
  } else {
    Options.LoadSizes = {4, 2, 1};
    Options.AllowedTailExpansions = {3};