1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "RISCVTargetTransformInfo.h"
10 #include "MCTargetDesc/RISCVMatInt.h"
11 #include "llvm/ADT/STLExtras.h"
12 #include "llvm/Analysis/TargetTransformInfo.h"
13 #include "llvm/CodeGen/BasicTTIImpl.h"
14 #include "llvm/CodeGen/CostTable.h"
15 #include "llvm/CodeGen/TargetLowering.h"
16 #include "llvm/IR/Instructions.h"
17 #include <cmath>
18 #include <optional>
19 using namespace llvm;
21 #define DEBUG_TYPE "riscvtti"
23 static cl::opt<unsigned> RVVRegisterWidthLMUL(
24 "riscv-v-register-bit-width-lmul",
25 cl::desc(
26 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
27 "by autovectorized code. Fractional LMULs are not supported."),
28 cl::init(2), cl::Hidden);
30 static cl::opt<unsigned> SLPMaxVF(
31 "riscv-v-slp-max-vf",
32 cl::desc(
33 "Overrides result used for getMaximumVF query which is used "
34 "exclusively by SLP vectorizer."),
35 cl::Hidden);
37 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
38 TTI::TargetCostKind CostKind) {
39 assert(Ty->isIntegerTy() &&
40 "getIntImmCost can only estimate cost of materialising integers");
42 // We have a Zero register, so 0 is always free.
43 if (Imm == 0)
44 return TTI::TCC_Free;
46 // Otherwise, we check how many instructions it will take to materialise.
47 const DataLayout &DL = getDataLayout();
48 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty),
49 getST()->getFeatureBits());
52 // Look for patterns of shift followed by AND that can be turned into a pair of
53 // shifts. We won't need to materialize an immediate for the AND so these can
54 // be considered free.
55 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
56 uint64_t Mask = Imm.getZExtValue();
57 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
58 if (!BO || !BO->hasOneUse())
59 return false;
61 if (BO->getOpcode() != Instruction::Shl)
62 return false;
64 if (!isa<ConstantInt>(BO->getOperand(1)))
65 return false;
67 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
68 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
69 // is a mask shifted by c2 bits with c3 leading zeros.
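// For example (illustrative, assuming RV64): with c2 = 4 and c1 = 0xFF0, the
// mask has exactly ShAmt = 4 trailing zeros and 52 leading zeros, so
// (and (shl x, 4), 0xFF0) can be emitted as slli x, 56 followed by srli x, 52,
// and no immediate needs to be materialized for the AND.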
70 if (isShiftedMask_64(Mask)) {
71 unsigned Trailing = llvm::countr_zero(Mask);
72 if (ShAmt == Trailing)
73 return true;
76 return false;
79 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
80 const APInt &Imm, Type *Ty,
81 TTI::TargetCostKind CostKind,
82 Instruction *Inst) {
83 assert(Ty->isIntegerTy() &&
84 "getIntImmCost can only estimate cost of materialising integers");
86 // We have a Zero register, so 0 is always free.
87 if (Imm == 0)
88 return TTI::TCC_Free;
90 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
91 // commutative; for the others, the immediate must come from a specific argument index.
92 bool Takes12BitImm = false;
93 unsigned ImmArgIdx = ~0U;
95 switch (Opcode) {
96 case Instruction::GetElementPtr:
97 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
98 // split up large offsets in GEP into better parts than ConstantHoisting
99 // can.
100 return TTI::TCC_Free;
101 case Instruction::And:
102 // zext.h
103 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
104 return TTI::TCC_Free;
105 // zext.w
106 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
107 return TTI::TCC_Free;
108 // bclri
109 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
110 return TTI::TCC_Free;
111 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
112 canUseShiftPair(Inst, Imm))
113 return TTI::TCC_Free;
114 Takes12BitImm = true;
115 break;
116 case Instruction::Add:
117 Takes12BitImm = true;
118 break;
119 case Instruction::Or:
120 case Instruction::Xor:
121 // bseti/binvi
122 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
123 return TTI::TCC_Free;
124 Takes12BitImm = true;
125 break;
126 case Instruction::Mul:
127 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
128 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
129 return TTI::TCC_Free;
130 // One more or less than a power of 2 can use SLLI+ADD/SUB.
131 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
132 return TTI::TCC_Free;
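// For example, x * 8 is a single slli, x * -8 is slli + neg, and x * 9 or
// x * 7 can be formed as slli + add or slli + sub respectively (illustrative;
// the exact lowering is chosen by the backend).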
133 // FIXME: There is no MULI instruction.
134 Takes12BitImm = true;
135 break;
136 case Instruction::Sub:
137 case Instruction::Shl:
138 case Instruction::LShr:
139 case Instruction::AShr:
140 Takes12BitImm = true;
141 ImmArgIdx = 1;
142 break;
143 default:
144 break;
147 if (Takes12BitImm) {
148 // Check immediate is the correct argument...
149 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
150 // ... and fits into the 12-bit immediate.
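// (isLegalAddImmediate accepts a sign-extended 12-bit value, i.e. an
// immediate in the range [-2048, 2047], matching the simm12 field of addi.)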
151 if (Imm.getSignificantBits() <= 64 &&
152 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
153 return TTI::TCC_Free;
157 // Otherwise, use the full materialisation cost.
158 return getIntImmCost(Imm, Ty, CostKind);
161 // By default, prevent hoisting.
162 return TTI::TCC_Free;
165 InstructionCost
166 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
167 const APInt &Imm, Type *Ty,
168 TTI::TargetCostKind CostKind) {
169 // Prevent hoisting in unknown cases.
170 return TTI::TCC_Free;
173 TargetTransformInfo::PopcntSupportKind
174 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
175 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
176 return ST->hasStdExtZbb() ? TTI::PSK_FastHardware : TTI::PSK_Software;
179 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
180 // Currently, the ExpandReductions pass can't expand scalable-vector
181 // reductions, but we still request expansion as RVV doesn't support certain
182 // reductions and the SelectionDAG can't legalize them either.
183 switch (II->getIntrinsicID()) {
184 default:
185 return false;
186 // These reductions have no equivalent in RVV
187 case Intrinsic::vector_reduce_mul:
188 case Intrinsic::vector_reduce_fmul:
189 return true;
193 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
194 if (ST->hasVInstructions())
195 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
196 return BaseT::getMaxVScale();
199 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
200 if (ST->hasVInstructions())
201 if (unsigned MinVLen = ST->getRealMinVLen();
202 MinVLen >= RISCV::RVVBitsPerBlock)
203 return MinVLen / RISCV::RVVBitsPerBlock;
204 return BaseT::getVScaleForTuning();
207 TypeSize
208 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
209 unsigned LMUL =
210 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
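// For example, riscv-v-register-bit-width-lmul=3 is clamped to [1, 8] and then
// rounded down to the power of two 2; assuming getRealMinVLen() is 128, the
// fixed-width vector register query below would report 2 * 128 = 256 bits
// (illustrative values, not taken from this file).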
211 switch (K) {
212 case TargetTransformInfo::RGK_Scalar:
213 return TypeSize::Fixed(ST->getXLen());
214 case TargetTransformInfo::RGK_FixedWidthVector:
215 return TypeSize::Fixed(
216 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
217 case TargetTransformInfo::RGK_ScalableVector:
218 return TypeSize::Scalable((ST->hasVInstructions() &&
219 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
220 ? LMUL * RISCV::RVVBitsPerBlock
221 : 0);
224 llvm_unreachable("Unsupported register kind");
227 InstructionCost
228 RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
229 // Add a cost of address generation + the cost of the load. The address
230 // is expected to be a PC relative offset to a constant pool entry
231 // using auipc/addi.
232 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
233 /*AddressSpace=*/0, CostKind);
236 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
237 LLVMContext &C) {
238 assert((DataVT.getScalarSizeInBits() != 8 ||
239 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
240 MVT IndexVT = DataVT.changeTypeToInteger();
241 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
242 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
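// For example, on RV32 a v4f64 data type yields v4i64 indices, whose scalar
// type is wider than XLEN, so 16-bit indices (the vrgatherei16 form) are used
// instead (illustrative example).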
243 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
246 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
247 VectorType *Tp, ArrayRef<int> Mask,
248 TTI::TargetCostKind CostKind,
249 int Index, VectorType *SubTp,
250 ArrayRef<const Value *> Args) {
251 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
253 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
255 // First, handle cases where having a fixed length vector enables us to
256 // give a more accurate cost than falling back to generic scalable codegen.
257 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
258 if (isa<FixedVectorType>(Tp)) {
259 switch (Kind) {
260 default:
261 break;
262 case TTI::SK_PermuteSingleSrc: {
263 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
264 MVT EltTp = LT.second.getVectorElementType();
265 // If the element size is smaller than ELEN, then shuffles that interleave
266 // or deinterleave 2 vectors can be lowered into the following
267 // sequences:
268 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
269 // Example sequence:
270 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
271 // vwaddu.vv v10, v8, v9
272 // li a0, -1 (ignored)
273 // vwmaccu.vx v10, a0, v9
274 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
275 return 2 * LT.first * TLI->getLMULCost(LT.second);
277 if (Mask[0] == 0 || Mask[0] == 1) {
278 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
279 // Example sequence:
280 // vnsrl.wi v10, v8, 0
281 if (equal(DeinterleaveMask, Mask))
282 return LT.first * TLI->getLMULCost(LT.second);
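// For instance, createStrideMask(0, 2, 4) yields <0, 2, 4, 6> and
// createStrideMask(1, 2, 4) yields <1, 3, 5, 7>, i.e. the even or odd elements
// of the source, and each such deinterleave is costed as a single vnsrl above.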
286 // vrgather + cost of generating the mask constant.
287 // We model this for an unknown mask with a single vrgather.
288 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
289 (LT.second.getScalarSizeInBits() != 8 ||
290 LT.second.getVectorNumElements() <= 256)) {
291 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
292 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
293 return IndexCost + TLI->getVRGatherVVCost(LT.second);
295 [[fallthrough]];
297 case TTI::SK_Transpose:
298 case TTI::SK_PermuteTwoSrc: {
299 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
300 // register for the second vrgather. We model this for an unknown
301 // (shuffle) mask.
302 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
303 (LT.second.getScalarSizeInBits() != 8 ||
304 LT.second.getVectorNumElements() <= 256)) {
305 auto &C = Tp->getContext();
306 auto EC = Tp->getElementCount();
307 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
308 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
309 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
310 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
311 return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost;
313 [[fallthrough]];
315 case TTI::SK_Select: {
316 // We are going to permute multiple sources and the result will be in
317 // multiple destinations. We provide an accurate cost only for splits where
318 // the element type remains the same.
319 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
320 LT.second.isFixedLengthVector() &&
321 LT.second.getVectorElementType().getSizeInBits() ==
322 Tp->getElementType()->getPrimitiveSizeInBits() &&
323 LT.second.getVectorNumElements() <
324 cast<FixedVectorType>(Tp)->getNumElements() &&
325 divideCeil(Mask.size(),
326 cast<FixedVectorType>(Tp)->getNumElements()) ==
327 static_cast<unsigned>(*LT.first.getValue())) {
328 unsigned NumRegs = *LT.first.getValue();
329 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
330 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
331 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
333 InstructionCost Cost = 0;
334 for (unsigned I = 0; I < NumRegs; ++I) {
335 bool IsSingleVector = true;
336 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
337 transform(Mask.slice(I * SubVF,
338 I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
339 SubMask.begin(), [&](int I) {
340 bool SingleSubVector = I / VF == 0;
341 IsSingleVector &= SingleSubVector;
342 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
344 Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
345 : TTI::SK_PermuteTwoSrc,
346 SubVecTy, SubMask, CostKind, 0, nullptr);
347 return Cost;
350 break;
355 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
356 switch (Kind) {
357 default:
358 // Fallthrough to generic handling.
359 // TODO: Most of these cases will return getInvalid in generic code, and
360 // must be implemented here.
361 break;
362 case TTI::SK_ExtractSubvector:
363 // Example sequence:
364 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
365 // vslidedown.vi v8, v9, 2
366 return LT.first * TLI->getVSlideCost(LT.second);
367 case TTI::SK_InsertSubvector:
368 // Example sequence:
369 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
370 // vslideup.vi v8, v9, 2
371 return LT.first * TLI->getVSlideCost(LT.second);
372 case TTI::SK_Select: {
373 // Example sequence:
374 // li a0, 90
375 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
376 // vmv.s.x v0, a0
377 // vmerge.vvm v8, v9, v8, v0
378 return LT.first * 3 * TLI->getLMULCost(LT.second);
380 case TTI::SK_Broadcast: {
381 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
382 Instruction::InsertElement);
383 if (LT.second.getScalarSizeInBits() == 1) {
384 if (HasScalar) {
385 // Example sequence:
386 // andi a0, a0, 1
387 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
388 // vmv.v.x v8, a0
389 // vmsne.vi v0, v8, 0
390 return LT.first * TLI->getLMULCost(LT.second) * 3;
392 // Example sequence:
393 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
394 // vmv.v.i v8, 0
395 // vmerge.vim v8, v8, 1, v0
396 // vmv.x.s a0, v8
397 // andi a0, a0, 1
398 // vmv.v.x v8, a0
399 // vmsne.vi v0, v8, 0
401 return LT.first * TLI->getLMULCost(LT.second) * 6;
404 if (HasScalar) {
405 // Example sequence:
406 // vmv.v.x v8, a0
407 return LT.first * TLI->getLMULCost(LT.second);
410 // Example sequence:
411 // vrgather.vi v9, v8, 0
412 return LT.first * TLI->getVRGatherVICost(LT.second);
414 case TTI::SK_Splice:
415 // vslidedown+vslideup.
416 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
417 // of similar code, but I think we expand through memory.
418 return 2 * LT.first * TLI->getVSlideCost(LT.second);
419 case TTI::SK_Reverse: {
420 // TODO: Cases to improve here:
421 // * Illegal vector types
422 // * i64 on RV32
423 // * i1 vector
424 // At low LMUL, most of the cost is producing the vrgather index register.
425 // At high LMUL, the cost of the vrgather itself will dominate.
426 // Example sequence:
427 // csrr a0, vlenb
428 // srli a0, a0, 3
429 // addi a0, a0, -1
430 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
431 // vid.v v9
432 // vrsub.vx v10, v9, a0
433 // vrgather.vv v9, v8, v10
434 InstructionCost LenCost = 3;
435 if (LT.second.isFixedLengthVector())
436 // vrsub.vi has a 5-bit immediate field; otherwise an li suffices
437 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
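// For example, reversing a fixed v8i8 needs the constant 7, which fits the
// signed 5-bit immediate range [-16, 15], so LenCost is 0; reversing v32i8
// needs 31, which does not fit, so one li is counted (illustrative).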
438 InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second);
439 // A mask (i1) vector additionally requires an extend and a truncate.
440 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
441 return LT.first * (LenCost + GatherCost + ExtendCost);
444 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
447 InstructionCost
448 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
449 unsigned AddressSpace,
450 TTI::TargetCostKind CostKind) {
451 if (!isLegalMaskedLoadStore(Src, Alignment) ||
452 CostKind != TTI::TCK_RecipThroughput)
453 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
454 CostKind);
456 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
459 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
460 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
461 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
462 bool UseMaskForCond, bool UseMaskForGaps) {
463 if (isa<ScalableVectorType>(VecTy))
464 return InstructionCost::getInvalid();
465 auto *FVTy = cast<FixedVectorType>(VecTy);
466 InstructionCost MemCost =
467 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
468 unsigned VF = FVTy->getNumElements() / Factor;
470 // The interleaved memory access pass will lower interleaved memory ops (i.e.
471 // a load or store paired with a specific shuffle) to vlseg/vsseg
472 // intrinsics. In those cases we can treat it as if it's just one (legal)
473 // memory op.
474 if (!UseMaskForCond && !UseMaskForGaps &&
475 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
476 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
477 // Need to make sure the type hasn't been scalarized
478 if (LT.second.isFixedLengthVector()) {
479 auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
480 LT.second.getVectorNumElements());
481 // FIXME: We use the memory op cost of the *legalized* type here, because
482 // getMemoryOpCost returns a really expensive cost for types like
483 // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
484 // Should the memory op cost of these be cheaper?
485 if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
486 AddressSpace, DL)) {
487 InstructionCost LegalMemCost = getMemoryOpCost(
488 Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
489 return LT.first + LegalMemCost;
494 // An interleaved load will look like this for Factor=3:
495 // %wide.vec = load <12 x i32>, ptr %3, align 4
496 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
497 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
498 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
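// With Factor = 3 and VF = 4, the three stride masks above are
// createStrideMask(0, 3, 4) = <0, 3, 6, 9>, createStrideMask(1, 3, 4) =
// <1, 4, 7, 10>, and createStrideMask(2, 3, 4) = <2, 5, 8, 11> (illustrative).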
499 if (Opcode == Instruction::Load) {
500 InstructionCost Cost = MemCost;
501 for (unsigned Index : Indices) {
502 FixedVectorType *SubVecTy =
503 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
504 auto Mask = createStrideMask(Index, Factor, VF);
505 InstructionCost ShuffleCost =
506 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
507 CostKind, 0, nullptr, {});
508 Cost += ShuffleCost;
510 return Cost;
513 // TODO: Model for NF > 2
514 // We'll need to enhance getShuffleCost to model shuffles that are just
515 // inserts and extracts into subvectors, since they won't have the full cost
516 // of a vrgather.
517 // An interleaved store for 3 vectors of 4 lanes will look like
518 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
519 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
520 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
521 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
522 // store <12 x i32> %interleaved.vec, ptr %10, align 4
523 if (Factor != 2)
524 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
525 Alignment, AddressSpace, CostKind,
526 UseMaskForCond, UseMaskForGaps);
528 assert(Opcode == Instruction::Store && "Opcode must be a store");
529 // For an interleaving store of 2 vectors, we perform one large interleaving
530 // shuffle that goes into the wide store
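// For example, with VF = 4 and Factor = 2, createInterleaveMask(4, 2) is
// <0, 4, 1, 5, 2, 6, 3, 7>, which interleaves the two input vectors
// element by element (illustrative).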
531 auto Mask = createInterleaveMask(VF, Factor);
532 InstructionCost ShuffleCost =
533 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
534 CostKind, 0, nullptr, {});
535 return MemCost + ShuffleCost;
538 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
539 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
540 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
541 if (CostKind != TTI::TCK_RecipThroughput)
542 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
543 Alignment, CostKind, I);
545 if ((Opcode == Instruction::Load &&
546 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
547 (Opcode == Instruction::Store &&
548 !isLegalMaskedScatter(DataTy, Align(Alignment))))
549 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
550 Alignment, CostKind, I);
552 // Cost is proportional to the number of memory operations implied. For
553 // scalable vectors, we use an estimate on that number since we don't
554 // know exactly what VL will be.
555 auto &VTy = *cast<VectorType>(DataTy);
556 InstructionCost MemOpCost =
557 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
558 {TTI::OK_AnyValue, TTI::OP_None}, I);
559 unsigned NumLoads = getEstimatedVLFor(&VTy);
560 return NumLoads * MemOpCost;
563 // Currently, these represent both throughput and codesize costs
564 // for the respective intrinsics. The costs in this table are simply
565 // instruction counts with the following adjustments made:
566 // * One vsetvli is considered free.
567 static const CostTblEntry VectorIntrinsicCostTable[]{
568 {Intrinsic::floor, MVT::v2f32, 9},
569 {Intrinsic::floor, MVT::v4f32, 9},
570 {Intrinsic::floor, MVT::v8f32, 9},
571 {Intrinsic::floor, MVT::v16f32, 9},
572 {Intrinsic::floor, MVT::nxv1f32, 9},
573 {Intrinsic::floor, MVT::nxv2f32, 9},
574 {Intrinsic::floor, MVT::nxv4f32, 9},
575 {Intrinsic::floor, MVT::nxv8f32, 9},
576 {Intrinsic::floor, MVT::nxv16f32, 9},
577 {Intrinsic::floor, MVT::v2f64, 9},
578 {Intrinsic::floor, MVT::v4f64, 9},
579 {Intrinsic::floor, MVT::v8f64, 9},
580 {Intrinsic::floor, MVT::v16f64, 9},
581 {Intrinsic::floor, MVT::nxv1f64, 9},
582 {Intrinsic::floor, MVT::nxv2f64, 9},
583 {Intrinsic::floor, MVT::nxv4f64, 9},
584 {Intrinsic::floor, MVT::nxv8f64, 9},
585 {Intrinsic::ceil, MVT::v2f32, 9},
586 {Intrinsic::ceil, MVT::v4f32, 9},
587 {Intrinsic::ceil, MVT::v8f32, 9},
588 {Intrinsic::ceil, MVT::v16f32, 9},
589 {Intrinsic::ceil, MVT::nxv1f32, 9},
590 {Intrinsic::ceil, MVT::nxv2f32, 9},
591 {Intrinsic::ceil, MVT::nxv4f32, 9},
592 {Intrinsic::ceil, MVT::nxv8f32, 9},
593 {Intrinsic::ceil, MVT::nxv16f32, 9},
594 {Intrinsic::ceil, MVT::v2f64, 9},
595 {Intrinsic::ceil, MVT::v4f64, 9},
596 {Intrinsic::ceil, MVT::v8f64, 9},
597 {Intrinsic::ceil, MVT::v16f64, 9},
598 {Intrinsic::ceil, MVT::nxv1f64, 9},
599 {Intrinsic::ceil, MVT::nxv2f64, 9},
600 {Intrinsic::ceil, MVT::nxv4f64, 9},
601 {Intrinsic::ceil, MVT::nxv8f64, 9},
602 {Intrinsic::trunc, MVT::v2f32, 7},
603 {Intrinsic::trunc, MVT::v4f32, 7},
604 {Intrinsic::trunc, MVT::v8f32, 7},
605 {Intrinsic::trunc, MVT::v16f32, 7},
606 {Intrinsic::trunc, MVT::nxv1f32, 7},
607 {Intrinsic::trunc, MVT::nxv2f32, 7},
608 {Intrinsic::trunc, MVT::nxv4f32, 7},
609 {Intrinsic::trunc, MVT::nxv8f32, 7},
610 {Intrinsic::trunc, MVT::nxv16f32, 7},
611 {Intrinsic::trunc, MVT::v2f64, 7},
612 {Intrinsic::trunc, MVT::v4f64, 7},
613 {Intrinsic::trunc, MVT::v8f64, 7},
614 {Intrinsic::trunc, MVT::v16f64, 7},
615 {Intrinsic::trunc, MVT::nxv1f64, 7},
616 {Intrinsic::trunc, MVT::nxv2f64, 7},
617 {Intrinsic::trunc, MVT::nxv4f64, 7},
618 {Intrinsic::trunc, MVT::nxv8f64, 7},
619 {Intrinsic::round, MVT::v2f32, 9},
620 {Intrinsic::round, MVT::v4f32, 9},
621 {Intrinsic::round, MVT::v8f32, 9},
622 {Intrinsic::round, MVT::v16f32, 9},
623 {Intrinsic::round, MVT::nxv1f32, 9},
624 {Intrinsic::round, MVT::nxv2f32, 9},
625 {Intrinsic::round, MVT::nxv4f32, 9},
626 {Intrinsic::round, MVT::nxv8f32, 9},
627 {Intrinsic::round, MVT::nxv16f32, 9},
628 {Intrinsic::round, MVT::v2f64, 9},
629 {Intrinsic::round, MVT::v4f64, 9},
630 {Intrinsic::round, MVT::v8f64, 9},
631 {Intrinsic::round, MVT::v16f64, 9},
632 {Intrinsic::round, MVT::nxv1f64, 9},
633 {Intrinsic::round, MVT::nxv2f64, 9},
634 {Intrinsic::round, MVT::nxv4f64, 9},
635 {Intrinsic::round, MVT::nxv8f64, 9},
636 {Intrinsic::roundeven, MVT::v2f32, 9},
637 {Intrinsic::roundeven, MVT::v4f32, 9},
638 {Intrinsic::roundeven, MVT::v8f32, 9},
639 {Intrinsic::roundeven, MVT::v16f32, 9},
640 {Intrinsic::roundeven, MVT::nxv1f32, 9},
641 {Intrinsic::roundeven, MVT::nxv2f32, 9},
642 {Intrinsic::roundeven, MVT::nxv4f32, 9},
643 {Intrinsic::roundeven, MVT::nxv8f32, 9},
644 {Intrinsic::roundeven, MVT::nxv16f32, 9},
645 {Intrinsic::roundeven, MVT::v2f64, 9},
646 {Intrinsic::roundeven, MVT::v4f64, 9},
647 {Intrinsic::roundeven, MVT::v8f64, 9},
648 {Intrinsic::roundeven, MVT::v16f64, 9},
649 {Intrinsic::roundeven, MVT::nxv1f64, 9},
650 {Intrinsic::roundeven, MVT::nxv2f64, 9},
651 {Intrinsic::roundeven, MVT::nxv4f64, 9},
652 {Intrinsic::roundeven, MVT::nxv8f64, 9},
653 {Intrinsic::rint, MVT::v2f32, 7},
654 {Intrinsic::rint, MVT::v4f32, 7},
655 {Intrinsic::rint, MVT::v8f32, 7},
656 {Intrinsic::rint, MVT::v16f32, 7},
657 {Intrinsic::rint, MVT::nxv1f32, 7},
658 {Intrinsic::rint, MVT::nxv2f32, 7},
659 {Intrinsic::rint, MVT::nxv4f32, 7},
660 {Intrinsic::rint, MVT::nxv8f32, 7},
661 {Intrinsic::rint, MVT::nxv16f32, 7},
662 {Intrinsic::rint, MVT::v2f64, 7},
663 {Intrinsic::rint, MVT::v4f64, 7},
664 {Intrinsic::rint, MVT::v8f64, 7},
665 {Intrinsic::rint, MVT::v16f64, 7},
666 {Intrinsic::rint, MVT::nxv1f64, 7},
667 {Intrinsic::rint, MVT::nxv2f64, 7},
668 {Intrinsic::rint, MVT::nxv4f64, 7},
669 {Intrinsic::rint, MVT::nxv8f64, 7},
670 {Intrinsic::lrint, MVT::v2i32, 1},
671 {Intrinsic::lrint, MVT::v4i32, 1},
672 {Intrinsic::lrint, MVT::v8i32, 1},
673 {Intrinsic::lrint, MVT::v16i32, 1},
674 {Intrinsic::lrint, MVT::nxv1i32, 1},
675 {Intrinsic::lrint, MVT::nxv2i32, 1},
676 {Intrinsic::lrint, MVT::nxv4i32, 1},
677 {Intrinsic::lrint, MVT::nxv8i32, 1},
678 {Intrinsic::lrint, MVT::nxv16i32, 1},
679 {Intrinsic::lrint, MVT::v2i64, 1},
680 {Intrinsic::lrint, MVT::v4i64, 1},
681 {Intrinsic::lrint, MVT::v8i64, 1},
682 {Intrinsic::lrint, MVT::v16i64, 1},
683 {Intrinsic::lrint, MVT::nxv1i64, 1},
684 {Intrinsic::lrint, MVT::nxv2i64, 1},
685 {Intrinsic::lrint, MVT::nxv4i64, 1},
686 {Intrinsic::lrint, MVT::nxv8i64, 1},
687 {Intrinsic::llrint, MVT::v2i64, 1},
688 {Intrinsic::llrint, MVT::v4i64, 1},
689 {Intrinsic::llrint, MVT::v8i64, 1},
690 {Intrinsic::llrint, MVT::v16i64, 1},
691 {Intrinsic::llrint, MVT::nxv1i64, 1},
692 {Intrinsic::llrint, MVT::nxv2i64, 1},
693 {Intrinsic::llrint, MVT::nxv4i64, 1},
694 {Intrinsic::llrint, MVT::nxv8i64, 1},
695 {Intrinsic::nearbyint, MVT::v2f32, 9},
696 {Intrinsic::nearbyint, MVT::v4f32, 9},
697 {Intrinsic::nearbyint, MVT::v8f32, 9},
698 {Intrinsic::nearbyint, MVT::v16f32, 9},
699 {Intrinsic::nearbyint, MVT::nxv1f32, 9},
700 {Intrinsic::nearbyint, MVT::nxv2f32, 9},
701 {Intrinsic::nearbyint, MVT::nxv4f32, 9},
702 {Intrinsic::nearbyint, MVT::nxv8f32, 9},
703 {Intrinsic::nearbyint, MVT::nxv16f32, 9},
704 {Intrinsic::nearbyint, MVT::v2f64, 9},
705 {Intrinsic::nearbyint, MVT::v4f64, 9},
706 {Intrinsic::nearbyint, MVT::v8f64, 9},
707 {Intrinsic::nearbyint, MVT::v16f64, 9},
708 {Intrinsic::nearbyint, MVT::nxv1f64, 9},
709 {Intrinsic::nearbyint, MVT::nxv2f64, 9},
710 {Intrinsic::nearbyint, MVT::nxv4f64, 9},
711 {Intrinsic::nearbyint, MVT::nxv8f64, 9},
712 {Intrinsic::bswap, MVT::v2i16, 3},
713 {Intrinsic::bswap, MVT::v4i16, 3},
714 {Intrinsic::bswap, MVT::v8i16, 3},
715 {Intrinsic::bswap, MVT::v16i16, 3},
716 {Intrinsic::bswap, MVT::nxv1i16, 3},
717 {Intrinsic::bswap, MVT::nxv2i16, 3},
718 {Intrinsic::bswap, MVT::nxv4i16, 3},
719 {Intrinsic::bswap, MVT::nxv8i16, 3},
720 {Intrinsic::bswap, MVT::nxv16i16, 3},
721 {Intrinsic::bswap, MVT::v2i32, 12},
722 {Intrinsic::bswap, MVT::v4i32, 12},
723 {Intrinsic::bswap, MVT::v8i32, 12},
724 {Intrinsic::bswap, MVT::v16i32, 12},
725 {Intrinsic::bswap, MVT::nxv1i32, 12},
726 {Intrinsic::bswap, MVT::nxv2i32, 12},
727 {Intrinsic::bswap, MVT::nxv4i32, 12},
728 {Intrinsic::bswap, MVT::nxv8i32, 12},
729 {Intrinsic::bswap, MVT::nxv16i32, 12},
730 {Intrinsic::bswap, MVT::v2i64, 31},
731 {Intrinsic::bswap, MVT::v4i64, 31},
732 {Intrinsic::bswap, MVT::v8i64, 31},
733 {Intrinsic::bswap, MVT::v16i64, 31},
734 {Intrinsic::bswap, MVT::nxv1i64, 31},
735 {Intrinsic::bswap, MVT::nxv2i64, 31},
736 {Intrinsic::bswap, MVT::nxv4i64, 31},
737 {Intrinsic::bswap, MVT::nxv8i64, 31},
738 {Intrinsic::vp_bswap, MVT::v2i16, 3},
739 {Intrinsic::vp_bswap, MVT::v4i16, 3},
740 {Intrinsic::vp_bswap, MVT::v8i16, 3},
741 {Intrinsic::vp_bswap, MVT::v16i16, 3},
742 {Intrinsic::vp_bswap, MVT::nxv1i16, 3},
743 {Intrinsic::vp_bswap, MVT::nxv2i16, 3},
744 {Intrinsic::vp_bswap, MVT::nxv4i16, 3},
745 {Intrinsic::vp_bswap, MVT::nxv8i16, 3},
746 {Intrinsic::vp_bswap, MVT::nxv16i16, 3},
747 {Intrinsic::vp_bswap, MVT::v2i32, 12},
748 {Intrinsic::vp_bswap, MVT::v4i32, 12},
749 {Intrinsic::vp_bswap, MVT::v8i32, 12},
750 {Intrinsic::vp_bswap, MVT::v16i32, 12},
751 {Intrinsic::vp_bswap, MVT::nxv1i32, 12},
752 {Intrinsic::vp_bswap, MVT::nxv2i32, 12},
753 {Intrinsic::vp_bswap, MVT::nxv4i32, 12},
754 {Intrinsic::vp_bswap, MVT::nxv8i32, 12},
755 {Intrinsic::vp_bswap, MVT::nxv16i32, 12},
756 {Intrinsic::vp_bswap, MVT::v2i64, 31},
757 {Intrinsic::vp_bswap, MVT::v4i64, 31},
758 {Intrinsic::vp_bswap, MVT::v8i64, 31},
759 {Intrinsic::vp_bswap, MVT::v16i64, 31},
760 {Intrinsic::vp_bswap, MVT::nxv1i64, 31},
761 {Intrinsic::vp_bswap, MVT::nxv2i64, 31},
762 {Intrinsic::vp_bswap, MVT::nxv4i64, 31},
763 {Intrinsic::vp_bswap, MVT::nxv8i64, 31},
764 {Intrinsic::vp_fshl, MVT::v2i8, 7},
765 {Intrinsic::vp_fshl, MVT::v4i8, 7},
766 {Intrinsic::vp_fshl, MVT::v8i8, 7},
767 {Intrinsic::vp_fshl, MVT::v16i8, 7},
768 {Intrinsic::vp_fshl, MVT::nxv1i8, 7},
769 {Intrinsic::vp_fshl, MVT::nxv2i8, 7},
770 {Intrinsic::vp_fshl, MVT::nxv4i8, 7},
771 {Intrinsic::vp_fshl, MVT::nxv8i8, 7},
772 {Intrinsic::vp_fshl, MVT::nxv16i8, 7},
773 {Intrinsic::vp_fshl, MVT::nxv32i8, 7},
774 {Intrinsic::vp_fshl, MVT::nxv64i8, 7},
775 {Intrinsic::vp_fshl, MVT::v2i16, 7},
776 {Intrinsic::vp_fshl, MVT::v4i16, 7},
777 {Intrinsic::vp_fshl, MVT::v8i16, 7},
778 {Intrinsic::vp_fshl, MVT::v16i16, 7},
779 {Intrinsic::vp_fshl, MVT::nxv1i16, 7},
780 {Intrinsic::vp_fshl, MVT::nxv2i16, 7},
781 {Intrinsic::vp_fshl, MVT::nxv4i16, 7},
782 {Intrinsic::vp_fshl, MVT::nxv8i16, 7},
783 {Intrinsic::vp_fshl, MVT::nxv16i16, 7},
784 {Intrinsic::vp_fshl, MVT::nxv32i16, 7},
785 {Intrinsic::vp_fshl, MVT::v2i32, 7},
786 {Intrinsic::vp_fshl, MVT::v4i32, 7},
787 {Intrinsic::vp_fshl, MVT::v8i32, 7},
788 {Intrinsic::vp_fshl, MVT::v16i32, 7},
789 {Intrinsic::vp_fshl, MVT::nxv1i32, 7},
790 {Intrinsic::vp_fshl, MVT::nxv2i32, 7},
791 {Intrinsic::vp_fshl, MVT::nxv4i32, 7},
792 {Intrinsic::vp_fshl, MVT::nxv8i32, 7},
793 {Intrinsic::vp_fshl, MVT::nxv16i32, 7},
794 {Intrinsic::vp_fshl, MVT::v2i64, 7},
795 {Intrinsic::vp_fshl, MVT::v4i64, 7},
796 {Intrinsic::vp_fshl, MVT::v8i64, 7},
797 {Intrinsic::vp_fshl, MVT::v16i64, 7},
798 {Intrinsic::vp_fshl, MVT::nxv1i64, 7},
799 {Intrinsic::vp_fshl, MVT::nxv2i64, 7},
800 {Intrinsic::vp_fshl, MVT::nxv4i64, 7},
801 {Intrinsic::vp_fshl, MVT::nxv8i64, 7},
802 {Intrinsic::vp_fshr, MVT::v2i8, 7},
803 {Intrinsic::vp_fshr, MVT::v4i8, 7},
804 {Intrinsic::vp_fshr, MVT::v8i8, 7},
805 {Intrinsic::vp_fshr, MVT::v16i8, 7},
806 {Intrinsic::vp_fshr, MVT::nxv1i8, 7},
807 {Intrinsic::vp_fshr, MVT::nxv2i8, 7},
808 {Intrinsic::vp_fshr, MVT::nxv4i8, 7},
809 {Intrinsic::vp_fshr, MVT::nxv8i8, 7},
810 {Intrinsic::vp_fshr, MVT::nxv16i8, 7},
811 {Intrinsic::vp_fshr, MVT::nxv32i8, 7},
812 {Intrinsic::vp_fshr, MVT::nxv64i8, 7},
813 {Intrinsic::vp_fshr, MVT::v2i16, 7},
814 {Intrinsic::vp_fshr, MVT::v4i16, 7},
815 {Intrinsic::vp_fshr, MVT::v8i16, 7},
816 {Intrinsic::vp_fshr, MVT::v16i16, 7},
817 {Intrinsic::vp_fshr, MVT::nxv1i16, 7},
818 {Intrinsic::vp_fshr, MVT::nxv2i16, 7},
819 {Intrinsic::vp_fshr, MVT::nxv4i16, 7},
820 {Intrinsic::vp_fshr, MVT::nxv8i16, 7},
821 {Intrinsic::vp_fshr, MVT::nxv16i16, 7},
822 {Intrinsic::vp_fshr, MVT::nxv32i16, 7},
823 {Intrinsic::vp_fshr, MVT::v2i32, 7},
824 {Intrinsic::vp_fshr, MVT::v4i32, 7},
825 {Intrinsic::vp_fshr, MVT::v8i32, 7},
826 {Intrinsic::vp_fshr, MVT::v16i32, 7},
827 {Intrinsic::vp_fshr, MVT::nxv1i32, 7},
828 {Intrinsic::vp_fshr, MVT::nxv2i32, 7},
829 {Intrinsic::vp_fshr, MVT::nxv4i32, 7},
830 {Intrinsic::vp_fshr, MVT::nxv8i32, 7},
831 {Intrinsic::vp_fshr, MVT::nxv16i32, 7},
832 {Intrinsic::vp_fshr, MVT::v2i64, 7},
833 {Intrinsic::vp_fshr, MVT::v4i64, 7},
834 {Intrinsic::vp_fshr, MVT::v8i64, 7},
835 {Intrinsic::vp_fshr, MVT::v16i64, 7},
836 {Intrinsic::vp_fshr, MVT::nxv1i64, 7},
837 {Intrinsic::vp_fshr, MVT::nxv2i64, 7},
838 {Intrinsic::vp_fshr, MVT::nxv4i64, 7},
839 {Intrinsic::vp_fshr, MVT::nxv8i64, 7},
840 {Intrinsic::bitreverse, MVT::v2i8, 17},
841 {Intrinsic::bitreverse, MVT::v4i8, 17},
842 {Intrinsic::bitreverse, MVT::v8i8, 17},
843 {Intrinsic::bitreverse, MVT::v16i8, 17},
844 {Intrinsic::bitreverse, MVT::nxv1i8, 17},
845 {Intrinsic::bitreverse, MVT::nxv2i8, 17},
846 {Intrinsic::bitreverse, MVT::nxv4i8, 17},
847 {Intrinsic::bitreverse, MVT::nxv8i8, 17},
848 {Intrinsic::bitreverse, MVT::nxv16i8, 17},
849 {Intrinsic::bitreverse, MVT::v2i16, 24},
850 {Intrinsic::bitreverse, MVT::v4i16, 24},
851 {Intrinsic::bitreverse, MVT::v8i16, 24},
852 {Intrinsic::bitreverse, MVT::v16i16, 24},
853 {Intrinsic::bitreverse, MVT::nxv1i16, 24},
854 {Intrinsic::bitreverse, MVT::nxv2i16, 24},
855 {Intrinsic::bitreverse, MVT::nxv4i16, 24},
856 {Intrinsic::bitreverse, MVT::nxv8i16, 24},
857 {Intrinsic::bitreverse, MVT::nxv16i16, 24},
858 {Intrinsic::bitreverse, MVT::v2i32, 33},
859 {Intrinsic::bitreverse, MVT::v4i32, 33},
860 {Intrinsic::bitreverse, MVT::v8i32, 33},
861 {Intrinsic::bitreverse, MVT::v16i32, 33},
862 {Intrinsic::bitreverse, MVT::nxv1i32, 33},
863 {Intrinsic::bitreverse, MVT::nxv2i32, 33},
864 {Intrinsic::bitreverse, MVT::nxv4i32, 33},
865 {Intrinsic::bitreverse, MVT::nxv8i32, 33},
866 {Intrinsic::bitreverse, MVT::nxv16i32, 33},
867 {Intrinsic::bitreverse, MVT::v2i64, 52},
868 {Intrinsic::bitreverse, MVT::v4i64, 52},
869 {Intrinsic::bitreverse, MVT::v8i64, 52},
870 {Intrinsic::bitreverse, MVT::v16i64, 52},
871 {Intrinsic::bitreverse, MVT::nxv1i64, 52},
872 {Intrinsic::bitreverse, MVT::nxv2i64, 52},
873 {Intrinsic::bitreverse, MVT::nxv4i64, 52},
874 {Intrinsic::bitreverse, MVT::nxv8i64, 52},
875 {Intrinsic::vp_bitreverse, MVT::v2i8, 17},
876 {Intrinsic::vp_bitreverse, MVT::v4i8, 17},
877 {Intrinsic::vp_bitreverse, MVT::v8i8, 17},
878 {Intrinsic::vp_bitreverse, MVT::v16i8, 17},
879 {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17},
880 {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17},
881 {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17},
882 {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17},
883 {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17},
884 {Intrinsic::vp_bitreverse, MVT::v2i16, 24},
885 {Intrinsic::vp_bitreverse, MVT::v4i16, 24},
886 {Intrinsic::vp_bitreverse, MVT::v8i16, 24},
887 {Intrinsic::vp_bitreverse, MVT::v16i16, 24},
888 {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24},
889 {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24},
890 {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24},
891 {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24},
892 {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24},
893 {Intrinsic::vp_bitreverse, MVT::v2i32, 33},
894 {Intrinsic::vp_bitreverse, MVT::v4i32, 33},
895 {Intrinsic::vp_bitreverse, MVT::v8i32, 33},
896 {Intrinsic::vp_bitreverse, MVT::v16i32, 33},
897 {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33},
898 {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33},
899 {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33},
900 {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33},
901 {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33},
902 {Intrinsic::vp_bitreverse, MVT::v2i64, 52},
903 {Intrinsic::vp_bitreverse, MVT::v4i64, 52},
904 {Intrinsic::vp_bitreverse, MVT::v8i64, 52},
905 {Intrinsic::vp_bitreverse, MVT::v16i64, 52},
906 {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52},
907 {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52},
908 {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52},
909 {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52},
910 {Intrinsic::ctpop, MVT::v2i8, 12},
911 {Intrinsic::ctpop, MVT::v4i8, 12},
912 {Intrinsic::ctpop, MVT::v8i8, 12},
913 {Intrinsic::ctpop, MVT::v16i8, 12},
914 {Intrinsic::ctpop, MVT::nxv1i8, 12},
915 {Intrinsic::ctpop, MVT::nxv2i8, 12},
916 {Intrinsic::ctpop, MVT::nxv4i8, 12},
917 {Intrinsic::ctpop, MVT::nxv8i8, 12},
918 {Intrinsic::ctpop, MVT::nxv16i8, 12},
919 {Intrinsic::ctpop, MVT::v2i16, 19},
920 {Intrinsic::ctpop, MVT::v4i16, 19},
921 {Intrinsic::ctpop, MVT::v8i16, 19},
922 {Intrinsic::ctpop, MVT::v16i16, 19},
923 {Intrinsic::ctpop, MVT::nxv1i16, 19},
924 {Intrinsic::ctpop, MVT::nxv2i16, 19},
925 {Intrinsic::ctpop, MVT::nxv4i16, 19},
926 {Intrinsic::ctpop, MVT::nxv8i16, 19},
927 {Intrinsic::ctpop, MVT::nxv16i16, 19},
928 {Intrinsic::ctpop, MVT::v2i32, 20},
929 {Intrinsic::ctpop, MVT::v4i32, 20},
930 {Intrinsic::ctpop, MVT::v8i32, 20},
931 {Intrinsic::ctpop, MVT::v16i32, 20},
932 {Intrinsic::ctpop, MVT::nxv1i32, 20},
933 {Intrinsic::ctpop, MVT::nxv2i32, 20},
934 {Intrinsic::ctpop, MVT::nxv4i32, 20},
935 {Intrinsic::ctpop, MVT::nxv8i32, 20},
936 {Intrinsic::ctpop, MVT::nxv16i32, 20},
937 {Intrinsic::ctpop, MVT::v2i64, 21},
938 {Intrinsic::ctpop, MVT::v4i64, 21},
939 {Intrinsic::ctpop, MVT::v8i64, 21},
940 {Intrinsic::ctpop, MVT::v16i64, 21},
941 {Intrinsic::ctpop, MVT::nxv1i64, 21},
942 {Intrinsic::ctpop, MVT::nxv2i64, 21},
943 {Intrinsic::ctpop, MVT::nxv4i64, 21},
944 {Intrinsic::ctpop, MVT::nxv8i64, 21},
945 {Intrinsic::vp_ctpop, MVT::v2i8, 12},
946 {Intrinsic::vp_ctpop, MVT::v4i8, 12},
947 {Intrinsic::vp_ctpop, MVT::v8i8, 12},
948 {Intrinsic::vp_ctpop, MVT::v16i8, 12},
949 {Intrinsic::vp_ctpop, MVT::nxv1i8, 12},
950 {Intrinsic::vp_ctpop, MVT::nxv2i8, 12},
951 {Intrinsic::vp_ctpop, MVT::nxv4i8, 12},
952 {Intrinsic::vp_ctpop, MVT::nxv8i8, 12},
953 {Intrinsic::vp_ctpop, MVT::nxv16i8, 12},
954 {Intrinsic::vp_ctpop, MVT::v2i16, 19},
955 {Intrinsic::vp_ctpop, MVT::v4i16, 19},
956 {Intrinsic::vp_ctpop, MVT::v8i16, 19},
957 {Intrinsic::vp_ctpop, MVT::v16i16, 19},
958 {Intrinsic::vp_ctpop, MVT::nxv1i16, 19},
959 {Intrinsic::vp_ctpop, MVT::nxv2i16, 19},
960 {Intrinsic::vp_ctpop, MVT::nxv4i16, 19},
961 {Intrinsic::vp_ctpop, MVT::nxv8i16, 19},
962 {Intrinsic::vp_ctpop, MVT::nxv16i16, 19},
963 {Intrinsic::vp_ctpop, MVT::v2i32, 20},
964 {Intrinsic::vp_ctpop, MVT::v4i32, 20},
965 {Intrinsic::vp_ctpop, MVT::v8i32, 20},
966 {Intrinsic::vp_ctpop, MVT::v16i32, 20},
967 {Intrinsic::vp_ctpop, MVT::nxv1i32, 20},
968 {Intrinsic::vp_ctpop, MVT::nxv2i32, 20},
969 {Intrinsic::vp_ctpop, MVT::nxv4i32, 20},
970 {Intrinsic::vp_ctpop, MVT::nxv8i32, 20},
971 {Intrinsic::vp_ctpop, MVT::nxv16i32, 20},
972 {Intrinsic::vp_ctpop, MVT::v2i64, 21},
973 {Intrinsic::vp_ctpop, MVT::v4i64, 21},
974 {Intrinsic::vp_ctpop, MVT::v8i64, 21},
975 {Intrinsic::vp_ctpop, MVT::v16i64, 21},
976 {Intrinsic::vp_ctpop, MVT::nxv1i64, 21},
977 {Intrinsic::vp_ctpop, MVT::nxv2i64, 21},
978 {Intrinsic::vp_ctpop, MVT::nxv4i64, 21},
979 {Intrinsic::vp_ctpop, MVT::nxv8i64, 21},
980 {Intrinsic::vp_ctlz, MVT::v2i8, 19},
981 {Intrinsic::vp_ctlz, MVT::v4i8, 19},
982 {Intrinsic::vp_ctlz, MVT::v8i8, 19},
983 {Intrinsic::vp_ctlz, MVT::v16i8, 19},
984 {Intrinsic::vp_ctlz, MVT::nxv1i8, 19},
985 {Intrinsic::vp_ctlz, MVT::nxv2i8, 19},
986 {Intrinsic::vp_ctlz, MVT::nxv4i8, 19},
987 {Intrinsic::vp_ctlz, MVT::nxv8i8, 19},
988 {Intrinsic::vp_ctlz, MVT::nxv16i8, 19},
989 {Intrinsic::vp_ctlz, MVT::nxv32i8, 19},
990 {Intrinsic::vp_ctlz, MVT::nxv64i8, 19},
991 {Intrinsic::vp_ctlz, MVT::v2i16, 28},
992 {Intrinsic::vp_ctlz, MVT::v4i16, 28},
993 {Intrinsic::vp_ctlz, MVT::v8i16, 28},
994 {Intrinsic::vp_ctlz, MVT::v16i16, 28},
995 {Intrinsic::vp_ctlz, MVT::nxv1i16, 28},
996 {Intrinsic::vp_ctlz, MVT::nxv2i16, 28},
997 {Intrinsic::vp_ctlz, MVT::nxv4i16, 28},
998 {Intrinsic::vp_ctlz, MVT::nxv8i16, 28},
999 {Intrinsic::vp_ctlz, MVT::nxv16i16, 28},
1000 {Intrinsic::vp_ctlz, MVT::nxv32i16, 28},
1001 {Intrinsic::vp_ctlz, MVT::v2i32, 31},
1002 {Intrinsic::vp_ctlz, MVT::v4i32, 31},
1003 {Intrinsic::vp_ctlz, MVT::v8i32, 31},
1004 {Intrinsic::vp_ctlz, MVT::v16i32, 31},
1005 {Intrinsic::vp_ctlz, MVT::nxv1i32, 31},
1006 {Intrinsic::vp_ctlz, MVT::nxv2i32, 31},
1007 {Intrinsic::vp_ctlz, MVT::nxv4i32, 31},
1008 {Intrinsic::vp_ctlz, MVT::nxv8i32, 31},
1009 {Intrinsic::vp_ctlz, MVT::nxv16i32, 31},
1010 {Intrinsic::vp_ctlz, MVT::v2i64, 35},
1011 {Intrinsic::vp_ctlz, MVT::v4i64, 35},
1012 {Intrinsic::vp_ctlz, MVT::v8i64, 35},
1013 {Intrinsic::vp_ctlz, MVT::v16i64, 35},
1014 {Intrinsic::vp_ctlz, MVT::nxv1i64, 35},
1015 {Intrinsic::vp_ctlz, MVT::nxv2i64, 35},
1016 {Intrinsic::vp_ctlz, MVT::nxv4i64, 35},
1017 {Intrinsic::vp_ctlz, MVT::nxv8i64, 35},
1018 {Intrinsic::vp_cttz, MVT::v2i8, 16},
1019 {Intrinsic::vp_cttz, MVT::v4i8, 16},
1020 {Intrinsic::vp_cttz, MVT::v8i8, 16},
1021 {Intrinsic::vp_cttz, MVT::v16i8, 16},
1022 {Intrinsic::vp_cttz, MVT::nxv1i8, 16},
1023 {Intrinsic::vp_cttz, MVT::nxv2i8, 16},
1024 {Intrinsic::vp_cttz, MVT::nxv4i8, 16},
1025 {Intrinsic::vp_cttz, MVT::nxv8i8, 16},
1026 {Intrinsic::vp_cttz, MVT::nxv16i8, 16},
1027 {Intrinsic::vp_cttz, MVT::nxv32i8, 16},
1028 {Intrinsic::vp_cttz, MVT::nxv64i8, 16},
1029 {Intrinsic::vp_cttz, MVT::v2i16, 23},
1030 {Intrinsic::vp_cttz, MVT::v4i16, 23},
1031 {Intrinsic::vp_cttz, MVT::v8i16, 23},
1032 {Intrinsic::vp_cttz, MVT::v16i16, 23},
1033 {Intrinsic::vp_cttz, MVT::nxv1i16, 23},
1034 {Intrinsic::vp_cttz, MVT::nxv2i16, 23},
1035 {Intrinsic::vp_cttz, MVT::nxv4i16, 23},
1036 {Intrinsic::vp_cttz, MVT::nxv8i16, 23},
1037 {Intrinsic::vp_cttz, MVT::nxv16i16, 23},
1038 {Intrinsic::vp_cttz, MVT::nxv32i16, 23},
1039 {Intrinsic::vp_cttz, MVT::v2i32, 24},
1040 {Intrinsic::vp_cttz, MVT::v4i32, 24},
1041 {Intrinsic::vp_cttz, MVT::v8i32, 24},
1042 {Intrinsic::vp_cttz, MVT::v16i32, 24},
1043 {Intrinsic::vp_cttz, MVT::nxv1i32, 24},
1044 {Intrinsic::vp_cttz, MVT::nxv2i32, 24},
1045 {Intrinsic::vp_cttz, MVT::nxv4i32, 24},
1046 {Intrinsic::vp_cttz, MVT::nxv8i32, 24},
1047 {Intrinsic::vp_cttz, MVT::nxv16i32, 24},
1048 {Intrinsic::vp_cttz, MVT::v2i64, 25},
1049 {Intrinsic::vp_cttz, MVT::v4i64, 25},
1050 {Intrinsic::vp_cttz, MVT::v8i64, 25},
1051 {Intrinsic::vp_cttz, MVT::v16i64, 25},
1052 {Intrinsic::vp_cttz, MVT::nxv1i64, 25},
1053 {Intrinsic::vp_cttz, MVT::nxv2i64, 25},
1054 {Intrinsic::vp_cttz, MVT::nxv4i64, 25},
1055 {Intrinsic::vp_cttz, MVT::nxv8i64, 25},
1058 static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
1059 switch (ID) {
1060 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
1061 case Intrinsic::VPID: \
1062 return ISD::VPSD;
1063 #include "llvm/IR/VPIntrinsics.def"
1064 #undef HELPER_MAP_VPID_TO_VPSD
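// For example, Intrinsic::vp_rint maps to ISD::VP_FRINT and Intrinsic::vp_ceil
// maps to ISD::VP_FCEIL (mapping provided by VPIntrinsics.def; listed here for
// illustration).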
1066 return ISD::DELETED_NODE;
1069 InstructionCost
1070 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1071 TTI::TargetCostKind CostKind) {
1072 auto *RetTy = ICA.getReturnType();
1073 switch (ICA.getID()) {
1074 case Intrinsic::ceil:
1075 case Intrinsic::floor:
1076 case Intrinsic::trunc:
1077 case Intrinsic::rint:
1078 case Intrinsic::lrint:
1079 case Intrinsic::llrint:
1080 case Intrinsic::round:
1081 case Intrinsic::roundeven: {
1082 // These all use the same code.
1083 auto LT = getTypeLegalizationCost(RetTy);
1084 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1085 return LT.first * 8;
1086 break;
1088 case Intrinsic::umin:
1089 case Intrinsic::umax:
1090 case Intrinsic::smin:
1091 case Intrinsic::smax: {
1092 auto LT = getTypeLegalizationCost(RetTy);
1093 if ((ST->hasVInstructions() && LT.second.isVector()) ||
1094 (LT.second.isScalarInteger() && ST->hasStdExtZbb()))
1095 return LT.first;
1096 break;
1098 case Intrinsic::sadd_sat:
1099 case Intrinsic::ssub_sat:
1100 case Intrinsic::uadd_sat:
1101 case Intrinsic::usub_sat:
1102 case Intrinsic::fabs:
1103 case Intrinsic::sqrt: {
1104 auto LT = getTypeLegalizationCost(RetTy);
1105 if (ST->hasVInstructions() && LT.second.isVector())
1106 return LT.first;
1107 break;
1109 case Intrinsic::ctpop: {
1110 auto LT = getTypeLegalizationCost(RetTy);
1111 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
1112 return LT.first;
1113 break;
1115 case Intrinsic::abs: {
1116 auto LT = getTypeLegalizationCost(RetTy);
1117 if (ST->hasVInstructions() && LT.second.isVector()) {
1118 // vrsub.vi v10, v8, 0
1119 // vmax.vv v8, v8, v10
1120 return LT.first * 2;
1122 break;
1124 // TODO: add more intrinsics
1125 case Intrinsic::experimental_stepvector: {
1126 unsigned Cost = 1; // vid
1127 auto LT = getTypeLegalizationCost(RetTy);
1128 return Cost + (LT.first - 1);
1130 case Intrinsic::vp_rint: {
1131 // The RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1132 unsigned Cost = 5;
1133 auto LT = getTypeLegalizationCost(RetTy);
1134 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1135 return Cost * LT.first;
1136 break;
1138 case Intrinsic::vp_nearbyint: {
1139 // One more read and one write of fflags than vp_rint.
1140 unsigned Cost = 7;
1141 auto LT = getTypeLegalizationCost(RetTy);
1142 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1143 return Cost * LT.first;
1144 break;
1146 case Intrinsic::vp_ceil:
1147 case Intrinsic::vp_floor:
1148 case Intrinsic::vp_round:
1149 case Intrinsic::vp_roundeven:
1150 case Intrinsic::vp_roundtozero: {
1151 // Rounding with static rounding mode needs two more instructions to
1152 // swap/write FRM than vp_rint.
1153 unsigned Cost = 7;
1154 auto LT = getTypeLegalizationCost(RetTy);
1155 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1156 if (TLI->isOperationCustom(VPISD, LT.second))
1157 return Cost * LT.first;
1158 break;
1162 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1163 auto LT = getTypeLegalizationCost(RetTy);
1164 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1165 ICA.getID(), LT.second))
1166 return LT.first * Entry->Cost;
1169 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1172 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1173 Type *Src,
1174 TTI::CastContextHint CCH,
1175 TTI::TargetCostKind CostKind,
1176 const Instruction *I) {
1177 if (isa<VectorType>(Dst) && isa<VectorType>(Src)) {
1178 // FIXME: Need to compute legalizing cost for illegal types.
1179 if (!isTypeLegal(Src) || !isTypeLegal(Dst))
1180 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1182 // Skip if element size of Dst or Src is bigger than ELEN.
1183 if (Src->getScalarSizeInBits() > ST->getELen() ||
1184 Dst->getScalarSizeInBits() > ST->getELen())
1185 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1187 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1188 assert(ISD && "Invalid opcode");
1190 // FIXME: Need to consider vsetvli and lmul.
1191 int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
1192 (int)Log2_32(Src->getScalarSizeInBits());
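// For example, an fpext from f16 to f64 has PowDiff = 6 - 4 = 2 and is costed
// below as two widening conversions; the symmetric fptrunc likewise costs two
// narrowing conversions (illustrative).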
1193 switch (ISD) {
1194 case ISD::SIGN_EXTEND:
1195 case ISD::ZERO_EXTEND:
1196 if (Src->getScalarSizeInBits() == 1) {
1197 // We do not use vsext/vzext to extend from mask vector.
1198 // Instead we use the following instructions to extend from mask vector:
1199 // vmv.v.i v8, 0
1200 // vmerge.vim v8, v8, -1, v0
1201 return 2;
1203 return 1;
1204 case ISD::TRUNCATE:
1205 if (Dst->getScalarSizeInBits() == 1) {
1206 // We do not use several vncvt instructions to truncate to a mask vector, so
1207 // we cannot use PowDiff to calculate the cost.
1208 // Instead we use the following instructions to truncate to a mask vector:
1209 // vand.vi v8, v8, 1
1210 // vmsne.vi v0, v8, 0
1211 return 2;
1213 [[fallthrough]];
1214 case ISD::FP_EXTEND:
1215 case ISD::FP_ROUND:
1216 // Counts of narrow/widen instructions.
1217 return std::abs(PowDiff);
1218 case ISD::FP_TO_SINT:
1219 case ISD::FP_TO_UINT:
1220 case ISD::SINT_TO_FP:
1221 case ISD::UINT_TO_FP:
1222 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
1223 // The cost of converting from or to a mask vector differs from the other
1224 // cases, so we cannot use PowDiff to calculate it.
1225 // For mask vector to fp, we should use the following instructions:
1226 // vmv.v.i v8, 0
1227 // vmerge.vim v8, v8, -1, v0
1228 // vfcvt.f.x.v v8, v8
1230 // And for fp vector to mask, we use:
1231 // vfncvt.rtz.x.f.w v9, v8
1232 // vand.vi v8, v9, 1
1233 // vmsne.vi v0, v8, 0
1234 return 3;
1236 if (std::abs(PowDiff) <= 1)
1237 return 1;
1238 // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
1239 // so it only needs two conversions.
1240 if (Src->isIntOrIntVectorTy())
1241 return 2;
1242 // Counts of narrow/widen instructions.
1243 return std::abs(PowDiff);
1246 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1249 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1250 if (isa<ScalableVectorType>(Ty)) {
1251 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1252 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1253 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1254 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1256 return cast<FixedVectorType>(Ty)->getNumElements();
1259 InstructionCost
1260 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1261 FastMathFlags FMF,
1262 TTI::TargetCostKind CostKind) {
1263 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1264 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1266 // Skip if scalar size of Ty is bigger than ELEN.
1267 if (Ty->getScalarSizeInBits() > ST->getELen())
1268 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1270 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1271 if (Ty->getElementType()->isIntegerTy(1))
1272 // vcpop sequences, see vreduction-mask.ll. umax and smin actually only
1273 // cost 2, but we don't have enough info here, so we slightly overestimate.
1274 return (LT.first - 1) + 3;
1276 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1277 InstructionCost BaseCost = 2;
1279 if (CostKind == TTI::TCK_CodeSize)
1280 return (LT.first - 1) + BaseCost;
1282 unsigned VL = getEstimatedVLFor(Ty);
1283 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1286 InstructionCost
1287 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1288 std::optional<FastMathFlags> FMF,
1289 TTI::TargetCostKind CostKind) {
1290 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1291 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1293 // Skip if scalar size of Ty is bigger than ELEN.
1294 if (Ty->getScalarSizeInBits() > ST->getELen())
1295 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1297 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1298 assert(ISD && "Invalid opcode");
1300 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1301 ISD != ISD::FADD)
1302 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1304 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1305 if (Ty->getElementType()->isIntegerTy(1))
1306 // vcpop sequences, see vreduction-mask.ll
1307 return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
1309 // An IR reduction is composed of two vmv instructions and one RVV reduction instruction.
1310 InstructionCost BaseCost = 2;
1312 if (CostKind == TTI::TCK_CodeSize)
1313 return (LT.first - 1) + BaseCost;
1315 unsigned VL = getEstimatedVLFor(Ty);
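// For example, a v16f32 fadd reduction has VL = 16: an ordered reduction adds
// VL = 16 to the base cost below, while an unordered one adds
// Log2_32_Ceil(16) = 4 (illustrative).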
1316 if (TTI::requiresOrderedReduction(FMF))
1317 return (LT.first - 1) + BaseCost + VL;
1318 return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1321 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1322 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1323 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1324 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1325 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1326 FMF, CostKind);
1328 // Skip if scalar size of ResTy is bigger than ELEN.
1329 if (ResTy->getScalarSizeInBits() > ST->getELen())
1330 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1331 FMF, CostKind);
1333 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1334 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1335 FMF, CostKind);
1337 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1339 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1340 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1341 FMF, CostKind);
1343 return (LT.first - 1) +
1344 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1347 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1348 TTI::OperandValueInfo OpInfo,
1349 TTI::TargetCostKind CostKind) {
1350 assert(OpInfo.isConstant() && "non constant operand?");
1351 if (!isa<VectorType>(Ty))
1352 // FIXME: We need to account for immediate materialization here, but doing
1353 // a decent job requires more knowledge about the immediate than we
1354 // currently have here.
1355 return 0;
1357 if (OpInfo.isUniform())
1358 // vmv.x.i, vmv.v.x, or vfmv.v.f
1359 // We ignore the cost of the scalar constant materialization to be consistent
1360 // with how we treat scalar constants themselves just above.
1361 return 1;
1363 return getConstantPoolLoadCost(Ty, CostKind);
1367 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1368 MaybeAlign Alignment,
1369 unsigned AddressSpace,
1370 TTI::TargetCostKind CostKind,
1371 TTI::OperandValueInfo OpInfo,
1372 const Instruction *I) {
1373 EVT VT = TLI->getValueType(DL, Src, true);
1374 // Type legalization can't handle structs
1375 if (VT == MVT::Other)
1376 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1377 CostKind, OpInfo, I);
1379 InstructionCost Cost = 0;
1380 if (Opcode == Instruction::Store && OpInfo.isConstant())
1381 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1382 InstructionCost BaseCost =
1383 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1384 CostKind, OpInfo, I);
1385 // Assume memory op costs scale with the number of vector registers
1386 // possibly accessed by the instruction. Note that BasicTTI already
1387 // handles the LT.first term for us.
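// For example, assuming VLEN = 128, a 512-bit fixed-width vector access
// legalizes to an LMUL=4 type, so BaseCost is scaled by the LMUL=4 factor
// returned by getLMULCost (illustrative; values depend on the subtarget).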
1388 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1389 LT.second.isVector())
1390 BaseCost *= TLI->getLMULCost(LT.second);
1391 return Cost + BaseCost;
1395 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1396 Type *CondTy,
1397 CmpInst::Predicate VecPred,
1398 TTI::TargetCostKind CostKind,
1399 const Instruction *I) {
1400 if (CostKind != TTI::TCK_RecipThroughput)
1401 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1404 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1405 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1408 // Skip if scalar size of ValTy is bigger than ELEN.
1409 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1410 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1413 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1414 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1415 if (CondTy->isVectorTy()) {
1416 if (ValTy->getScalarSizeInBits() == 1) {
1417 // vmandn.mm v8, v8, v9
1418 // vmand.mm v9, v0, v9
1419 // vmor.mm v0, v9, v8
1420 return LT.first * 3;
1422 // vselect and max/min are supported natively.
1423 return LT.first * 1;
1426 if (ValTy->getScalarSizeInBits() == 1) {
1427 // vmv.v.x v9, a0
1428 // vmsne.vi v9, v9, 0
1429 // vmandn.mm v8, v8, v9
1430 // vmand.mm v9, v0, v9
1431 // vmor.mm v0, v9, v8
1432 return LT.first * 5;
1435 // vmv.v.x v10, a0
1436 // vmsne.vi v0, v10, 0
1437 // vmerge.vvm v8, v9, v8, v0
1438 return LT.first * 3;
1441 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1442 ValTy->isVectorTy()) {
1443 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1445 // Support natively.
1446 if (CmpInst::isIntPredicate(VecPred))
1447 return LT.first * 1;
1449 // If we do not support the input floating-point vector type, use the base
1450 // implementation, which will calculate the cost as:
1451 // ScalarizeCost + Num * Cost for a fixed vector,
1452 // InvalidCost for a scalable vector.
1453 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1454 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1455 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1456 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1458 switch (VecPred) {
1459 // Support natively.
    case CmpInst::FCMP_OEQ:
    case CmpInst::FCMP_OGT:
    case CmpInst::FCMP_OGE:
    case CmpInst::FCMP_OLT:
    case CmpInst::FCMP_OLE:
    case CmpInst::FCMP_UNE:
      return LT.first * 1;
    // TODO: Other comparisons?
    default:
      break;
    }
  }

  // TODO: Add cost for scalar type.

  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
}
InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}
InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                 TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Opcode != Instruction::ExtractElement &&
      Opcode != Instruction::InsertElement)
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);

  // This type is legalized to a scalar type.
  if (!LT.second.isVector()) {
    auto *FixedVecTy = cast<FixedVectorType>(Val);
    // If Index is a known constant, the cost is zero.
    if (Index != -1U)
      return 0;
    // Extract/InsertElement with a non-constant index is very costly when
    // scalarized; estimate the cost of a load/store sequence via the stack:
    // ExtractElement cost: store vector to stack, load scalar;
    // InsertElement cost: store vector to stack, store scalar, load vector.
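    // E.g., a variable-index extract from a <4 x i32> that is not legal as a
    // vector is costed as four element stores plus one element load.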
    Type *ElemTy = FixedVecTy->getElementType();
    auto NumElems = FixedVecTy->getNumElements();
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
    return Opcode == Instruction::ExtractElement
               ? StoreCost * NumElems + LoadCost
               : (StoreCost + LoadCost) * NumElems + StoreCost;
  }

  // For unsupported scalable vector types, return the invalid cost.
  if (LT.second.isScalableVector() && !LT.first.isValid())
    return LT.first;

  if (!isTypeLegal(Val))
    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);

  // Mask vector extract/insert is expanded via e8.
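  // That is, the i1 vector is zero-extended to an i8 vector, the element is
  // extracted or inserted there, and for inserts the result is truncated back
  // to i1.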
  if (Val->getScalarSizeInBits() == 1) {
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost =
          getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost =
          getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost =
        getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost =
        getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost =
        getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
  }

  // In RVV, we can use vslidedown + vmv.x.s to extract an element from a
  // vector, and vslideup + vmv.s.x to insert an element into a vector.
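  // E.g., a constant-index extract from a small integer vector is roughly:
  //   vslidedown.vi v8, v8, <idx>
  //   vmv.x.s a0, v8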
  unsigned BaseCost = 1;
  // For insertelement we also need an instruction to add 1 to the index as the
  // input of vslideup.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;

  if (Index != -1U) {
    // The type may be split. For fixed-width vectors we can normalize the
    // index to the new type.
    if (LT.second.isFixedLengthVector()) {
      unsigned Width = LT.second.getVectorNumElements();
      Index = Index % Width;
    }

    // We can extract/insert the first element without vslidedown/vslideup.
    if (Index == 0)
      SlideCost = 0;
    else if (Opcode == Instruction::InsertElement)
      SlideCost = 1; // With a constant index, we do not need to use addi.
  }

  // Extracting or inserting an i64 on a target with XLEN=32 needs more
  // instructions.
  if (Val->getScalarType()->isIntegerTy() &&
      ST->getXLen() < Val->getScalarSizeInBits()) {
    // For extractelement, we need the following instructions:
    // vsetivli zero, 1, e64, m1, ta, mu (not counted)
    // vslidedown.vx v8, v8, a0
    // vmv.x.s a0, v8
    // li a1, 32
    // vsrl.vx v8, v8, a1
    // vmv.x.s a1, v8

    // For insertelement, we need the following instructions:
    // vsetivli zero, 2, e32, m4, ta, mu (not counted)
    // vmv.v.i v12, 0
    // vslide1up.vx v16, v12, a1
    // vslide1up.vx v12, v16, a0
    // addi a0, a2, 1
    // vsetvli zero, a0, e64, m4, tu, mu (not counted)
    // vslideup.vx v8, v12, a2

    // TODO: should we count these special vsetvlis?
    BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
  }
  return BaseCost + SlideCost;
}
InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Skip if the scalar size of Ty is bigger than ELEN.
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  // TODO: Handle scalar type.
  if (!LT.second.isVector())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);

  auto getConstantMatCost =
      [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
    if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
      // Two sub-cases:
      // * Has a 5-bit immediate operand which can be splatted.
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
      // We return 0 for both as we currently ignore the cost of materializing
      // scalar constants in GPRs.
      return 0;

    return getConstantPoolLoadCost(Ty, CostKind);
  };

  // Add the cost of materializing any constant vectors required.
  InstructionCost ConstantMatCost = 0;
  if (Op1Info.isConstant())
    ConstantMatCost += getConstantMatCost(0, Op1Info);
  if (Op2Info.isConstant())
    ConstantMatCost += getConstantMatCost(1, Op2Info);
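  // For the opcodes below, assume one vector instruction per legalized
  // operation, scaled by LMUL and by the number of register groups (LT.first);
  // everything else falls back to the base implementation.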
  switch (TLI->InstructionOpcodeToISD(Opcode)) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::MUL:
  case ISD::MULHS:
  case ISD::MULHU:
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FNEG: {
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first * 1;
  }
  default:
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                         Args, CxtI);
  }
}
// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
InstructionCost RISCVTTIImpl::getPointersChainCost(
    ArrayRef<const Value *> Ptrs, const Value *Base,
    const TTI::PointersChainInfo &Info, Type *AccessTy,
    TTI::TargetCostKind CostKind) {
  InstructionCost Cost = TTI::TCC_Free;
  // In the basic model we take into account GEP instructions only (although
  // the chain may also contain alloca instructions, values, constants and/or
  // constant expressions, PHIs, bitcasts ... whatever is allowed to be used as
  // a pointer). Typically, if Base is not a GEP instruction and all the
  // pointers are relative to the same base address, all the rest are either
  // GEP instructions, PHIs, bitcasts or constants. When we have the same base,
  // we just calculate the cost of each non-Base GEP as an ADD operation if any
  // of its indices is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as a sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
    if (!GEP)
      continue;
    if (Info.isSameBase() && V != Base) {
      if (GEP->hasAllConstantIndices())
        continue;
      // If the chain is unit-stride and BaseReg + stride*i is a legal
      // addressing mode, then presume the base GEP is sitting around in a
      // register somewhere and check if we can fold the offset relative to
      // it.
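      // E.g., for a unit-stride chain of i32 accesses (offsets 0, 4, 8, ...),
      // each small offset typically fits the 12-bit immediate of a scalar
      // load/store and the GEP is treated as free.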
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
    } else {
      SmallVector<const Value *> Indices(GEP->indices());
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
    }
  }
  return Cost;
}
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  // TODO: More tuning on benchmarks and metrics with changes as needed
  // would apply to all settings below to enable performance.

  if (ST->enableDefaultUnroll())
    return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable upper-bound unrolling universally, not dependent on the conditions
  // below.
  UP.UpperBound = true;

  // Disable loop unrolling for Oz and Os.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;
  if (L->getHeader()->getParent()->hasOptSize())
    return;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);
  LLVM_DEBUG(dbgs() << "Loop has:\n"
                    << "Blocks: " << L->getNumBlocks() << "\n"
                    << "Exit blocks: " << ExitingBlocks.size() << "\n");

  // Only allow at most one exit other than the latch. This acts as an early
  // exit as it mirrors the profitability calculation of the runtime unroller.
  if (ExitingBlocks.size() > 2)
    return;

  // Limit the CFG of the loop body for targets with a branch predictor.
  // Allowing 4 blocks permits if-then-else diamonds in the body.
  if (L->getNumBlocks() > 4)
    return;

  // Don't unroll vectorized loops, including the remainder loop.
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
    return;

  // Scan the loop: don't unroll loops with calls as this could prevent
  // inlining.
  InstructionCost Cost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      // Initial setting - Don't unroll loops containing vectorized
      // instructions.
      if (I.getType()->isVectorTy())
        return;

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
          if (!isLoweredToCall(F))
            continue;
        }
        return;
      }

      SmallVector<const Value *> Operands(I.operand_values());
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
    }
  }

  LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");

  UP.Partial = true;
  UP.Runtime = true;
  UP.UnrollRemainder = true;
  UP.UnrollAndJam = true;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Forcing unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
  if (Cost < 12)
    UP.Force = true;
}
void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}
unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
  TypeSize Size = DL.getTypeSizeInBits(Ty);
  if (Ty->isVectorTy()) {
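    // Scalable vectors are counted in units of RVVBitsPerBlock (64 bits), so,
    // e.g., a <vscale x 4 x i32> (minimum 128 bits) is counted as two
    // registers.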
    if (Size.isScalable() && ST->hasVInstructions())
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);

    if (ST->useRVVForFixedLengthVectors())
      return divideCeil(Size, ST->getRealMinVLen());
  }

  return BaseT::getRegUsageForType(Ty);
}
unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
  if (SLPMaxVF.getNumOccurrences())
    return SLPMaxVF;

  // Return how many elements can fit in getRegisterBitWidth. This is the
  // same routine as used in LoopVectorizer. We should probably be
  // accounting for whether we actually have instructions with the right
  // lane type, but we don't have enough information to do that without
  // some additional plumbing which hasn't been justified yet.
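  // E.g., if getRegisterBitWidth reports 256 bits (VLEN=128 with the default
  // riscv-v-register-bit-width-lmul of 2), 32-bit elements give a maximum
  // SLP VF of 8.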
  TypeSize RegWidth =
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  // If no vector registers, or absurd element widths, disable
  // vectorization by returning 1.
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
}
bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                 const TargetTransformInfo::LSRCost &C2) {
  // The RISC-V-specific choice here is to give the instruction count first
  // priority.
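  // std::tie yields a lexicographic comparison, so Insns is compared first and
  // the remaining fields only break ties.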
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);