llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "RISCVTargetTransformInfo.h"
10 #include "MCTargetDesc/RISCVMatInt.h"
11 #include "llvm/ADT/STLExtras.h"
12 #include "llvm/Analysis/TargetTransformInfo.h"
13 #include "llvm/CodeGen/BasicTTIImpl.h"
14 #include "llvm/CodeGen/CostTable.h"
15 #include "llvm/CodeGen/TargetLowering.h"
16 #include "llvm/IR/Instructions.h"
17 #include "llvm/IR/PatternMatch.h"
18 #include <cmath>
19 #include <optional>
20 using namespace llvm;
21 using namespace llvm::PatternMatch;
23 #define DEBUG_TYPE "riscvtti"
25 static cl::opt<unsigned> RVVRegisterWidthLMUL(
26 "riscv-v-register-bit-width-lmul",
27 cl::desc(
28 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
29 "by autovectorized code. Fractional LMULs are not supported."),
30 cl::init(2), cl::Hidden);
32 static cl::opt<unsigned> SLPMaxVF(
33 "riscv-v-slp-max-vf",
34 cl::desc(
35 "Overrides result used for getMaximumVF query which is used "
36 "exclusively by SLP vectorizer."),
37 cl::Hidden);
39 InstructionCost
40 RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
41 TTI::TargetCostKind CostKind) {
42 // Check if the type is valid for all CostKinds.
43 if (!VT.isVector())
44 return InstructionCost::getInvalid();
45 size_t NumInstr = OpCodes.size();
46 if (CostKind == TTI::TCK_CodeSize)
47 return NumInstr;
48 InstructionCost LMULCost = TLI->getLMULCost(VT);
49 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
50 return LMULCost * NumInstr;
51 InstructionCost Cost = 0;
52 for (auto Op : OpCodes) {
53 switch (Op) {
54 case RISCV::VRGATHER_VI:
55 Cost += TLI->getVRGatherVICost(VT);
56 break;
57 case RISCV::VRGATHER_VV:
58 Cost += TLI->getVRGatherVVCost(VT);
59 break;
60 case RISCV::VSLIDEUP_VI:
61 case RISCV::VSLIDEDOWN_VI:
62 Cost += TLI->getVSlideVICost(VT);
63 break;
64 case RISCV::VSLIDEUP_VX:
65 case RISCV::VSLIDEDOWN_VX:
66 Cost += TLI->getVSlideVXCost(VT);
67 break;
68 case RISCV::VREDMAX_VS:
69 case RISCV::VREDMIN_VS:
70 case RISCV::VREDMAXU_VS:
71 case RISCV::VREDMINU_VS:
72 case RISCV::VREDSUM_VS:
73 case RISCV::VREDAND_VS:
74 case RISCV::VREDOR_VS:
75 case RISCV::VREDXOR_VS:
76 case RISCV::VFREDMAX_VS:
77 case RISCV::VFREDMIN_VS:
78 case RISCV::VFREDUSUM_VS: {
79 unsigned VL = VT.getVectorMinNumElements();
80 if (!VT.isFixedLengthVector())
81 VL *= *getVScaleForTuning();
82 Cost += Log2_32_Ceil(VL);
83 break;
85 case RISCV::VFREDOSUM_VS: {
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += VL;
90 break;
92 case RISCV::VMV_X_S:
93 case RISCV::VMV_S_X:
94 case RISCV::VFMV_F_S:
95 case RISCV::VFMV_S_F:
96 case RISCV::VMOR_MM:
97 case RISCV::VMXOR_MM:
98 case RISCV::VMAND_MM:
99 case RISCV::VMANDN_MM:
100 case RISCV::VMNAND_MM:
101 case RISCV::VCPOP_M:
102 case RISCV::VFIRST_M:
103 Cost += 1;
104 break;
105 default:
106 Cost += LMULCost;
109 return Cost;
112 static InstructionCost getIntImmCostImpl(const DataLayout &DL,
113 const RISCVSubtarget *ST,
114 const APInt &Imm, Type *Ty,
115 TTI::TargetCostKind CostKind,
116 bool FreeZeroes) {
117 assert(Ty->isIntegerTy() &&
118 "getIntImmCost can only estimate cost of materialising integers");
120 // We have a Zero register, so 0 is always free.
121 if (Imm == 0)
122 return TTI::TCC_Free;
124 // Otherwise, we check how many instructions it will take to materialise.
125 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
126 /*CompressionCost=*/false, FreeZeroes);
129 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
130 TTI::TargetCostKind CostKind) {
131 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
134 // Look for patterns of shift followed by AND that can be turned into a pair of
135 // shifts. We won't need to materialize an immediate for the AND so these can
136 // be considered free.
137 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
138 uint64_t Mask = Imm.getZExtValue();
139 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
140 if (!BO || !BO->hasOneUse())
141 return false;
143 if (BO->getOpcode() != Instruction::Shl)
144 return false;
146 if (!isa<ConstantInt>(BO->getOperand(1)))
147 return false;
149 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
150 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
151 // is a mask shifted by c2 bits with c3 leading zeros.
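// For example, with Mask = 0xf0 and ShAmt = 4 (XLEN = 64), the pattern
// (and (shl x, 4), 0xf0) keeps bits [3:0] of x and can be rewritten as
// (srli (slli x, 60), 56), so no immediate needs to be materialised.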
152 if (isShiftedMask_64(Mask)) {
153 unsigned Trailing = llvm::countr_zero(Mask);
154 if (ShAmt == Trailing)
155 return true;
158 return false;
161 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
162 const APInt &Imm, Type *Ty,
163 TTI::TargetCostKind CostKind,
164 Instruction *Inst) {
165 assert(Ty->isIntegerTy() &&
166 "getIntImmCost can only estimate cost of materialising integers");
168 // We have a Zero register, so 0 is always free.
169 if (Imm == 0)
170 return TTI::TCC_Free;
172 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
173 // commutative; in others the immediate must come from a specific argument index.
174 bool Takes12BitImm = false;
175 unsigned ImmArgIdx = ~0U;
177 switch (Opcode) {
178 case Instruction::GetElementPtr:
179 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
180 // split up large offsets in GEP into better parts than ConstantHoisting
181 // can.
182 return TTI::TCC_Free;
183 case Instruction::Store: {
184 // Use the materialization cost regardless of whether it's the address or the
185 // value that is constant, except when the store is misaligned and
186 // misaligned accesses are not legal (experience shows constant hoisting
187 // can sometimes be harmful in such cases).
188 if (Idx == 1 || !Inst)
189 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
190 /*FreeZeroes=*/true);
192 StoreInst *ST = cast<StoreInst>(Inst);
193 if (!getTLI()->allowsMemoryAccessForAlignment(
194 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
195 ST->getPointerAddressSpace(), ST->getAlign()))
196 return TTI::TCC_Free;
198 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
199 /*FreeZeroes=*/true);
201 case Instruction::Load:
202 // If the address is a constant, use the materialization cost.
203 return getIntImmCost(Imm, Ty, CostKind);
204 case Instruction::And:
205 // zext.h
206 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
207 return TTI::TCC_Free;
208 // zext.w
209 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
210 return TTI::TCC_Free;
211 // bclri
212 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
213 return TTI::TCC_Free;
214 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
215 canUseShiftPair(Inst, Imm))
216 return TTI::TCC_Free;
217 Takes12BitImm = true;
218 break;
219 case Instruction::Add:
220 Takes12BitImm = true;
221 break;
222 case Instruction::Or:
223 case Instruction::Xor:
224 // bseti/binvi
225 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
226 return TTI::TCC_Free;
227 Takes12BitImm = true;
228 break;
229 case Instruction::Mul:
230 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
231 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
232 return TTI::TCC_Free;
233 // One more or less than a power of 2 can use SLLI+ADD/SUB.
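// e.g. x * 9 lowers to (slli t0, x, 3; add x, t0, x) and
//      x * 7 lowers to (slli t0, x, 3; sub x, t0, x).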
234 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
235 return TTI::TCC_Free;
236 // FIXME: There is no MULI instruction.
237 Takes12BitImm = true;
238 break;
239 case Instruction::Sub:
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 Takes12BitImm = true;
244 ImmArgIdx = 1;
245 break;
246 default:
247 break;
250 if (Takes12BitImm) {
251 // Check immediate is the correct argument...
252 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
253 // ... and fits into the 12-bit immediate.
254 if (Imm.getSignificantBits() <= 64 &&
255 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
256 return TTI::TCC_Free;
260 // Otherwise, use the full materialisation cost.
261 return getIntImmCost(Imm, Ty, CostKind);
264 // By default, prevent hoisting.
265 return TTI::TCC_Free;
268 InstructionCost
269 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
270 const APInt &Imm, Type *Ty,
271 TTI::TargetCostKind CostKind) {
272 // Prevent hoisting in unknown cases.
273 return TTI::TCC_Free;
276 bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
277 return ST->hasVInstructions();
280 TargetTransformInfo::PopcntSupportKind
281 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
282 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
283 return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
284 ? TTI::PSK_FastHardware
285 : TTI::PSK_Software;
288 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
289 // Currently, the ExpandReductions pass can't expand scalable-vector
290 // reductions, but we still request expansion as RVV doesn't support certain
291 // reductions and the SelectionDAG can't legalize them either.
292 switch (II->getIntrinsicID()) {
293 default:
294 return false;
295 // These reductions have no equivalent in RVV
296 case Intrinsic::vector_reduce_mul:
297 case Intrinsic::vector_reduce_fmul:
298 return true;
302 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
303 if (ST->hasVInstructions())
304 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
305 return BaseT::getMaxVScale();
308 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
309 if (ST->hasVInstructions())
310 if (unsigned MinVLen = ST->getRealMinVLen();
311 MinVLen >= RISCV::RVVBitsPerBlock)
312 return MinVLen / RISCV::RVVBitsPerBlock;
313 return BaseT::getVScaleForTuning();
316 TypeSize
317 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
318 unsigned LMUL =
319 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
320 switch (K) {
321 case TargetTransformInfo::RGK_Scalar:
322 return TypeSize::getFixed(ST->getXLen());
323 case TargetTransformInfo::RGK_FixedWidthVector:
324 return TypeSize::getFixed(
325 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
326 case TargetTransformInfo::RGK_ScalableVector:
327 return TypeSize::getScalable(
328 (ST->hasVInstructions() &&
329 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
330 ? LMUL * RISCV::RVVBitsPerBlock
331 : 0);
334 llvm_unreachable("Unsupported register kind");
337 InstructionCost
338 RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
339 // Add a cost of address generation + the cost of the load. The address
340 // is expected to be a PC relative offset to a constant pool entry
341 // using auipc/addi.
342 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
343 /*AddressSpace=*/0, CostKind);
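// Returns true if Mask is a repeated concatenation of its leading subvector,
// e.g. <0, 1, 0, 1, 0, 1, 0, 1> repeats the first two lanes, setting
// SubVectorSize to 2. An identity mask is not treated as a concatenation.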
346 static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
347 unsigned Size = Mask.size();
348 if (!isPowerOf2_32(Size))
349 return false;
350 for (unsigned I = 0; I != Size; ++I) {
351 if (static_cast<unsigned>(Mask[I]) == I)
352 continue;
353 if (Mask[I] != 0)
354 return false;
355 if (Size % I != 0)
356 return false;
357 for (unsigned J = I + 1; J != Size; ++J)
358 // Check the pattern is repeated.
359 if (static_cast<unsigned>(Mask[J]) != J % I)
360 return false;
361 SubVectorSize = I;
362 return true;
364 // Reaching here means Mask is an identity mask (e.g. <0, 1, 2, 3>), which is not a concatenation.
365 return false;
368 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
369 LLVMContext &C) {
370 assert((DataVT.getScalarSizeInBits() != 8 ||
371 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
372 MVT IndexVT = DataVT.changeTypeToInteger();
373 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
374 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
375 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
378 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
379 VectorType *Tp, ArrayRef<int> Mask,
380 TTI::TargetCostKind CostKind,
381 int Index, VectorType *SubTp,
382 ArrayRef<const Value *> Args,
383 const Instruction *CxtI) {
384 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
386 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
388 // First, handle cases where having a fixed length vector enables us to
389 // give a more accurate cost than falling back to generic scalable codegen.
390 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
391 if (isa<FixedVectorType>(Tp)) {
392 switch (Kind) {
393 default:
394 break;
395 case TTI::SK_PermuteSingleSrc: {
396 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
397 MVT EltTp = LT.second.getVectorElementType();
398 // If the element size is < ELEN, then interleaving and deinterleaving
399 // shuffles of 2 vectors can be lowered into the following
400 // sequences:
401 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
402 // Example sequence:
403 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
404 // vwaddu.vv v10, v8, v9
405 // li a0, -1 (ignored)
406 // vwmaccu.vx v10, a0, v9
407 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
408 return 2 * LT.first * TLI->getLMULCost(LT.second);
410 if (Mask[0] == 0 || Mask[0] == 1) {
411 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
412 // Example sequence:
413 // vnsrl.wi v10, v8, 0
414 if (equal(DeinterleaveMask, Mask))
415 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
416 LT.second, CostKind);
419 int SubVectorSize;
420 if (LT.second.getScalarSizeInBits() != 1 &&
421 isRepeatedConcatMask(Mask, SubVectorSize)) {
422 InstructionCost Cost = 0;
423 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
424 // The cost of extraction from a subvector is 0 if the index is 0.
425 for (unsigned I = 0; I != NumSlides; ++I) {
426 unsigned InsertIndex = SubVectorSize * (1 << I);
427 FixedVectorType *SubTp =
428 FixedVectorType::get(Tp->getElementType(), InsertIndex);
429 FixedVectorType *DestTp =
430 FixedVectorType::getDoubleElementsVectorType(SubTp);
431 std::pair<InstructionCost, MVT> DestLT =
432 getTypeLegalizationCost(DestTp);
433 // Add the cost of whole vector register move because the
434 // destination vector register group for vslideup cannot overlap the
435 // source.
436 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
437 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
438 CostKind, InsertIndex, SubTp);
440 return Cost;
443 // vrgather + cost of generating the mask constant.
444 // We model this for an unknown mask with a single vrgather.
445 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
446 (LT.second.getScalarSizeInBits() != 8 ||
447 LT.second.getVectorNumElements() <= 256)) {
448 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
449 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
450 return IndexCost +
451 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
453 [[fallthrough]];
455 case TTI::SK_Transpose:
456 case TTI::SK_PermuteTwoSrc: {
457 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
458 // register for the second vrgather. We model this for an unknown
459 // (shuffle) mask.
460 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
461 (LT.second.getScalarSizeInBits() != 8 ||
462 LT.second.getVectorNumElements() <= 256)) {
463 auto &C = Tp->getContext();
464 auto EC = Tp->getElementCount();
465 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
466 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
467 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
468 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
469 return 2 * IndexCost +
470 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
471 LT.second, CostKind) +
472 MaskCost;
474 [[fallthrough]];
476 case TTI::SK_Select: {
477 // We are going to permute multiple sources and the result will be in
478 // multiple destinations. We provide an accurate cost only for splits where
479 // the element type remains the same.
480 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
481 LT.second.isFixedLengthVector() &&
482 LT.second.getVectorElementType().getSizeInBits() ==
483 Tp->getElementType()->getPrimitiveSizeInBits() &&
484 LT.second.getVectorNumElements() <
485 cast<FixedVectorType>(Tp)->getNumElements() &&
486 divideCeil(Mask.size(),
487 cast<FixedVectorType>(Tp)->getNumElements()) ==
488 static_cast<unsigned>(*LT.first.getValue())) {
489 unsigned NumRegs = *LT.first.getValue();
490 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
491 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
492 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
494 InstructionCost Cost = 0;
495 for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF);
496 I < NumSrcRegs; ++I) {
497 bool IsSingleVector = true;
498 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
499 transform(
500 Mask.slice(I * SubVF,
501 I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF),
502 SubMask.begin(), [&](int I) -> int {
503 if (I == PoisonMaskElem)
504 return PoisonMaskElem;
505 bool SingleSubVector = I / VF == 0;
506 IsSingleVector &= SingleSubVector;
507 return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF;
509 if (all_of(enumerate(SubMask), [](auto &&P) {
510 return P.value() == PoisonMaskElem ||
511 static_cast<unsigned>(P.value()) == P.index();
513 continue;
514 Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
515 : TTI::SK_PermuteTwoSrc,
516 SubVecTy, SubMask, CostKind, 0, nullptr);
518 return Cost;
520 break;
525 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
526 switch (Kind) {
527 default:
528 // Fallthrough to generic handling.
529 // TODO: Most of these cases will return getInvalid in generic code, and
530 // must be implemented here.
531 break;
532 case TTI::SK_ExtractSubvector:
533 // Extract at zero is always a subregister extract
534 if (Index == 0)
535 return TTI::TCC_Free;
537 // If we're extracting a subvector of at most m1 size at a sub-register
538 // boundary - which unfortunately requires exact VLEN to identify - this is
539 // a subregister extract at worst and thus won't require a vslidedown.
540 // TODO: Extend for aligned m2, m4 subvector extracts
541 // TODO: Extend for misaligned (but contained) extracts
542 // TODO: Extend for scalable subvector types
543 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
544 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
545 const unsigned MinVLen = ST->getRealMinVLen();
546 const unsigned MaxVLen = ST->getRealMaxVLen();
547 if (MinVLen == MaxVLen &&
548 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
549 SubLT.second.getSizeInBits() <= MinVLen)
550 return TTI::TCC_Free;
553 // Example sequence:
554 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
555 // vslidedown.vi v8, v9, 2
556 return LT.first *
557 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
558 case TTI::SK_InsertSubvector:
559 // Example sequence:
560 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
561 // vslideup.vi v8, v9, 2
562 return LT.first *
563 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
564 case TTI::SK_Select: {
565 // Example sequence:
566 // li a0, 90
567 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
568 // vmv.s.x v0, a0
569 // vmerge.vvm v8, v9, v8, v0
570 // We use 2 for the cost of the mask materialization as this is the true
571 // cost for small masks and most shuffles are small. At worst, this cost
572 // should be a very small constant for the constant pool load. As such,
573 // we may bias towards large selects slightly more than truly warranted.
574 return LT.first *
575 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
576 LT.second, CostKind));
578 case TTI::SK_Broadcast: {
579 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
580 Instruction::InsertElement);
581 if (LT.second.getScalarSizeInBits() == 1) {
582 if (HasScalar) {
583 // Example sequence:
584 // andi a0, a0, 1
585 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
586 // vmv.v.x v8, a0
587 // vmsne.vi v0, v8, 0
588 return LT.first *
589 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
590 LT.second, CostKind));
592 // Example sequence:
593 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
594 // vmv.v.i v8, 0
595 // vmerge.vim v8, v8, 1, v0
596 // vmv.x.s a0, v8
597 // andi a0, a0, 1
598 // vmv.v.x v8, a0
599 // vmsne.vi v0, v8, 0
601 return LT.first *
602 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
603 RISCV::VMV_X_S, RISCV::VMV_V_X,
604 RISCV::VMSNE_VI},
605 LT.second, CostKind));
608 if (HasScalar) {
609 // Example sequence:
610 // vmv.v.x v8, a0
611 return LT.first *
612 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
615 // Example sequence:
616 // vrgather.vi v9, v8, 0
617 return LT.first *
618 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
620 case TTI::SK_Splice: {
621 // vslidedown+vslideup.
622 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
623 // of similar code, but I think we expand through memory.
624 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
625 if (Index >= 0 && Index < 32)
626 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
627 else if (Index < 0 && Index > -32)
628 Opcodes[1] = RISCV::VSLIDEUP_VI;
629 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
631 case TTI::SK_Reverse: {
632 // TODO: Cases to improve here:
633 // * Illegal vector types
634 // * i64 on RV32
635 // * i1 vector
636 // At low LMUL, most of the cost is producing the vrgather index register.
637 // At high LMUL, the cost of the vrgather itself will dominate.
638 // Example sequence:
639 // csrr a0, vlenb
640 // srli a0, a0, 3
641 // addi a0, a0, -1
642 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
643 // vid.v v9
644 // vrsub.vx v10, v9, a0
645 // vrgather.vv v9, v8, v10
646 InstructionCost LenCost = 3;
647 if (LT.second.isFixedLengthVector())
648 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
649 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
650 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
651 if (LT.second.isFixedLengthVector() &&
652 isInt<5>(LT.second.getVectorNumElements() - 1))
653 Opcodes[1] = RISCV::VRSUB_VI;
654 InstructionCost GatherCost =
655 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
656 // A mask vector additionally requires an extend and a truncate
657 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
658 return LT.first * (LenCost + GatherCost + ExtendCost);
661 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
664 static unsigned isM1OrSmaller(MVT VT) {
665 RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
666 return (LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 ||
667 LMUL == RISCVII::VLMUL::LMUL_F2 || LMUL == RISCVII::VLMUL::LMUL_1);
670 InstructionCost RISCVTTIImpl::getScalarizationOverhead(
671 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
672 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
673 if (isa<ScalableVectorType>(Ty))
674 return InstructionCost::getInvalid();
676 // A build_vector (which is m1 sized or smaller) can be done in no
677 // worse than one vslide1down.vx per element in the type. We could
678 // in theory do an explode_vector in the inverse manner, but our
679 // lowering today does not have a first class node for this pattern.
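// e.g. a fully demanded <4 x i32> build_vector is costed at no more than
// 4 * cost(vslide1down.vx) when its container type is m1 or smaller.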
680 InstructionCost Cost = BaseT::getScalarizationOverhead(
681 Ty, DemandedElts, Insert, Extract, CostKind);
682 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
683 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
684 if (Ty->getScalarSizeInBits() == 1) {
685 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
686 // Note: Implicit scalar anyextend is assumed to be free since the i1
687 // must be stored in a GPR.
688 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
689 CostKind) +
690 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
691 TTI::CastContextHint::None, CostKind, nullptr);
694 assert(LT.second.isFixedLengthVector());
695 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
696 if (isM1OrSmaller(ContainerVT)) {
697 InstructionCost BV =
698 cast<FixedVectorType>(Ty)->getNumElements() *
699 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
700 if (BV < Cost)
701 Cost = BV;
704 return Cost;
707 InstructionCost
708 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
709 unsigned AddressSpace,
710 TTI::TargetCostKind CostKind) {
711 if (!isLegalMaskedLoadStore(Src, Alignment) ||
712 CostKind != TTI::TCK_RecipThroughput)
713 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
714 CostKind);
716 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
719 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
720 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
721 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
722 bool UseMaskForCond, bool UseMaskForGaps) {
724 // The interleaved memory access pass will lower interleaved memory ops (i.e.
725 // a load or store combined with a specific shuffle) to vlseg/vsseg
726 // intrinsics.
727 if (!UseMaskForCond && !UseMaskForGaps &&
728 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
729 auto *VTy = cast<VectorType>(VecTy);
730 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
731 // Need to make sure the type hasn't been scalarized
732 if (LT.second.isVector()) {
733 auto *SubVecTy =
734 VectorType::get(VTy->getElementType(),
735 VTy->getElementCount().divideCoefficientBy(Factor));
736 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
737 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
738 AddressSpace, DL)) {
740 // Some processors optimize segment loads/stores as one wide memory op +
741 // Factor * LMUL shuffle ops.
742 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
743 InstructionCost Cost =
744 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
745 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
746 Cost += Factor * TLI->getLMULCost(SubVecVT);
747 return LT.first * Cost;
750 // Otherwise, the cost is proportional to the number of elements (VL *
751 // Factor ops).
752 InstructionCost MemOpCost =
753 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
754 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
755 unsigned NumLoads = getEstimatedVLFor(VTy);
756 return NumLoads * MemOpCost;
761 // TODO: Return the cost of interleaved accesses for scalable vectors when
762 // unable to convert them to segment access instructions.
763 if (isa<ScalableVectorType>(VecTy))
764 return InstructionCost::getInvalid();
766 auto *FVTy = cast<FixedVectorType>(VecTy);
767 InstructionCost MemCost =
768 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
769 unsigned VF = FVTy->getNumElements() / Factor;
771 // An interleaved load will look like this for Factor=3:
772 // %wide.vec = load <12 x i32>, ptr %3, align 4
773 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
774 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
775 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
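// Each <stride mask> above is createStrideMask(Index, Factor, VF); for
// Index=0, Factor=3, VF=4 that is <0, 3, 6, 9>.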
776 if (Opcode == Instruction::Load) {
777 InstructionCost Cost = MemCost;
778 for (unsigned Index : Indices) {
779 FixedVectorType *SubVecTy =
780 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
781 auto Mask = createStrideMask(Index, Factor, VF);
782 InstructionCost ShuffleCost =
783 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
784 CostKind, 0, nullptr, {});
785 Cost += ShuffleCost;
787 return Cost;
790 // TODO: Model for NF > 2
791 // We'll need to enhance getShuffleCost to model shuffles that are just
792 // inserts and extracts into subvectors, since they won't have the full cost
793 // of a vrgather.
794 // An interleaved store for 3 vectors of 4 lanes will look like
795 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
796 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
797 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
798 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
799 // store <12 x i32> %interleaved.vec, ptr %10, align 4
800 if (Factor != 2)
801 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
802 Alignment, AddressSpace, CostKind,
803 UseMaskForCond, UseMaskForGaps);
805 assert(Opcode == Instruction::Store && "Opcode must be a store");
806 // For an interleaving store of 2 vectors, we perform one large interleaving
807 // shuffle that goes into the wide store
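// e.g. for VF=4 and Factor=2 the interleave mask is <0, 4, 1, 5, 2, 6, 3, 7>.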
808 auto Mask = createInterleaveMask(VF, Factor);
809 InstructionCost ShuffleCost =
810 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
811 CostKind, 0, nullptr, {});
812 return MemCost + ShuffleCost;
815 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
816 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
817 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
818 if (CostKind != TTI::TCK_RecipThroughput)
819 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
820 Alignment, CostKind, I);
822 if ((Opcode == Instruction::Load &&
823 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
824 (Opcode == Instruction::Store &&
825 !isLegalMaskedScatter(DataTy, Align(Alignment))))
826 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
827 Alignment, CostKind, I);
829 // Cost is proportional to the number of memory operations implied. For
830 // scalable vectors, we use an estimate on that number since we don't
831 // know exactly what VL will be.
832 auto &VTy = *cast<VectorType>(DataTy);
833 InstructionCost MemOpCost =
834 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
835 {TTI::OK_AnyValue, TTI::OP_None}, I);
836 unsigned NumLoads = getEstimatedVLFor(&VTy);
837 return NumLoads * MemOpCost;
840 InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
841 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
842 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
843 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
844 !isLegalStridedLoadStore(DataTy, Alignment)) ||
845 (Opcode != Instruction::Load && Opcode != Instruction::Store))
846 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
847 Alignment, CostKind, I);
849 if (CostKind == TTI::TCK_CodeSize)
850 return TTI::TCC_Basic;
852 // Cost is proportional to the number of memory operations implied. For
853 // scalable vectors, we use an estimate on that number since we don't
854 // know exactly what VL will be.
855 auto &VTy = *cast<VectorType>(DataTy);
856 InstructionCost MemOpCost =
857 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
858 {TTI::OK_AnyValue, TTI::OP_None}, I);
859 unsigned NumLoads = getEstimatedVLFor(&VTy);
860 return NumLoads * MemOpCost;
863 InstructionCost
864 RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
865 // FIXME: This is a property of the default vector convention, not
866 // all possible calling conventions. Fixing that will require
867 // some TTI API and SLP rework.
868 InstructionCost Cost = 0;
869 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
870 for (auto *Ty : Tys) {
871 if (!Ty->isVectorTy())
872 continue;
873 Align A = DL.getPrefTypeAlign(Ty);
874 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
875 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
877 return Cost;
880 // Currently, these represent both throughput and codesize costs
881 // for the respective intrinsics. The costs in this table are simply
882 // instruction counts with the following adjustments made:
883 // * One vsetvli is considered free.
884 static const CostTblEntry VectorIntrinsicCostTable[]{
885 {Intrinsic::floor, MVT::f32, 9},
886 {Intrinsic::floor, MVT::f64, 9},
887 {Intrinsic::ceil, MVT::f32, 9},
888 {Intrinsic::ceil, MVT::f64, 9},
889 {Intrinsic::trunc, MVT::f32, 7},
890 {Intrinsic::trunc, MVT::f64, 7},
891 {Intrinsic::round, MVT::f32, 9},
892 {Intrinsic::round, MVT::f64, 9},
893 {Intrinsic::roundeven, MVT::f32, 9},
894 {Intrinsic::roundeven, MVT::f64, 9},
895 {Intrinsic::rint, MVT::f32, 7},
896 {Intrinsic::rint, MVT::f64, 7},
897 {Intrinsic::lrint, MVT::i32, 1},
898 {Intrinsic::lrint, MVT::i64, 1},
899 {Intrinsic::llrint, MVT::i64, 1},
900 {Intrinsic::nearbyint, MVT::f32, 9},
901 {Intrinsic::nearbyint, MVT::f64, 9},
902 {Intrinsic::bswap, MVT::i16, 3},
903 {Intrinsic::bswap, MVT::i32, 12},
904 {Intrinsic::bswap, MVT::i64, 31},
905 {Intrinsic::vp_bswap, MVT::i16, 3},
906 {Intrinsic::vp_bswap, MVT::i32, 12},
907 {Intrinsic::vp_bswap, MVT::i64, 31},
908 {Intrinsic::vp_fshl, MVT::i8, 7},
909 {Intrinsic::vp_fshl, MVT::i16, 7},
910 {Intrinsic::vp_fshl, MVT::i32, 7},
911 {Intrinsic::vp_fshl, MVT::i64, 7},
912 {Intrinsic::vp_fshr, MVT::i8, 7},
913 {Intrinsic::vp_fshr, MVT::i16, 7},
914 {Intrinsic::vp_fshr, MVT::i32, 7},
915 {Intrinsic::vp_fshr, MVT::i64, 7},
916 {Intrinsic::bitreverse, MVT::i8, 17},
917 {Intrinsic::bitreverse, MVT::i16, 24},
918 {Intrinsic::bitreverse, MVT::i32, 33},
919 {Intrinsic::bitreverse, MVT::i64, 52},
920 {Intrinsic::vp_bitreverse, MVT::i8, 17},
921 {Intrinsic::vp_bitreverse, MVT::i16, 24},
922 {Intrinsic::vp_bitreverse, MVT::i32, 33},
923 {Intrinsic::vp_bitreverse, MVT::i64, 52},
924 {Intrinsic::ctpop, MVT::i8, 12},
925 {Intrinsic::ctpop, MVT::i16, 19},
926 {Intrinsic::ctpop, MVT::i32, 20},
927 {Intrinsic::ctpop, MVT::i64, 21},
928 {Intrinsic::ctlz, MVT::i8, 19},
929 {Intrinsic::ctlz, MVT::i16, 28},
930 {Intrinsic::ctlz, MVT::i32, 31},
931 {Intrinsic::ctlz, MVT::i64, 35},
932 {Intrinsic::cttz, MVT::i8, 16},
933 {Intrinsic::cttz, MVT::i16, 23},
934 {Intrinsic::cttz, MVT::i32, 24},
935 {Intrinsic::cttz, MVT::i64, 25},
936 {Intrinsic::vp_ctpop, MVT::i8, 12},
937 {Intrinsic::vp_ctpop, MVT::i16, 19},
938 {Intrinsic::vp_ctpop, MVT::i32, 20},
939 {Intrinsic::vp_ctpop, MVT::i64, 21},
940 {Intrinsic::vp_ctlz, MVT::i8, 19},
941 {Intrinsic::vp_ctlz, MVT::i16, 28},
942 {Intrinsic::vp_ctlz, MVT::i32, 31},
943 {Intrinsic::vp_ctlz, MVT::i64, 35},
944 {Intrinsic::vp_cttz, MVT::i8, 16},
945 {Intrinsic::vp_cttz, MVT::i16, 23},
946 {Intrinsic::vp_cttz, MVT::i32, 24},
947 {Intrinsic::vp_cttz, MVT::i64, 25},
950 static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
951 switch (ID) {
952 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
953 case Intrinsic::VPID: \
954 return ISD::VPSD;
955 #include "llvm/IR/VPIntrinsics.def"
956 #undef HELPER_MAP_VPID_TO_VPSD
958 return ISD::DELETED_NODE;
961 InstructionCost
962 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
963 TTI::TargetCostKind CostKind) {
964 auto *RetTy = ICA.getReturnType();
965 switch (ICA.getID()) {
966 case Intrinsic::lrint:
967 case Intrinsic::llrint:
968 // We can't currently lower half or bfloat vector lrint/llrint.
969 if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
970 VecTy && VecTy->getElementType()->is16bitFPTy())
971 return InstructionCost::getInvalid();
972 [[fallthrough]];
973 case Intrinsic::ceil:
974 case Intrinsic::floor:
975 case Intrinsic::trunc:
976 case Intrinsic::rint:
977 case Intrinsic::round:
978 case Intrinsic::roundeven: {
979 // These all use the same code.
980 auto LT = getTypeLegalizationCost(RetTy);
981 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
982 return LT.first * 8;
983 break;
985 case Intrinsic::umin:
986 case Intrinsic::umax:
987 case Intrinsic::smin:
988 case Intrinsic::smax: {
989 auto LT = getTypeLegalizationCost(RetTy);
990 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
991 return LT.first;
993 if (ST->hasVInstructions() && LT.second.isVector()) {
994 unsigned Op;
995 switch (ICA.getID()) {
996 case Intrinsic::umin:
997 Op = RISCV::VMINU_VV;
998 break;
999 case Intrinsic::umax:
1000 Op = RISCV::VMAXU_VV;
1001 break;
1002 case Intrinsic::smin:
1003 Op = RISCV::VMIN_VV;
1004 break;
1005 case Intrinsic::smax:
1006 Op = RISCV::VMAX_VV;
1007 break;
1009 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1011 break;
1013 case Intrinsic::sadd_sat:
1014 case Intrinsic::ssub_sat:
1015 case Intrinsic::uadd_sat:
1016 case Intrinsic::usub_sat: {
1017 auto LT = getTypeLegalizationCost(RetTy);
1018 if (ST->hasVInstructions() && LT.second.isVector()) {
1019 unsigned Op;
1020 switch (ICA.getID()) {
1021 case Intrinsic::sadd_sat:
1022 Op = RISCV::VSADD_VV;
1023 break;
1024 case Intrinsic::ssub_sat:
1025 Op = RISCV::VSSUB_VV;
1026 break;
1027 case Intrinsic::uadd_sat:
1028 Op = RISCV::VSADDU_VV;
1029 break;
1030 case Intrinsic::usub_sat:
1031 Op = RISCV::VSSUBU_VV;
1032 break;
1034 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1036 break;
1038 case Intrinsic::fabs:
1039 case Intrinsic::sqrt: {
1040 auto LT = getTypeLegalizationCost(RetTy);
1041 // TODO: add f16/bf16, bf16 with zvfbfmin && f16 with zvfhmin
1042 if (ST->hasVInstructions() && LT.second.isVector()) {
1043 unsigned Op;
1044 switch (ICA.getID()) {
1045 case Intrinsic::fabs:
1046 Op = RISCV::VFSGNJX_VV;
1047 break;
1048 case Intrinsic::sqrt:
1049 Op = RISCV::VFSQRT_V;
1050 break;
1052 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1054 break;
1056 case Intrinsic::cttz:
1057 case Intrinsic::ctlz:
1058 case Intrinsic::ctpop: {
1059 auto LT = getTypeLegalizationCost(RetTy);
1060 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
1061 unsigned Op;
1062 switch (ICA.getID()) {
1063 case Intrinsic::cttz:
1064 Op = RISCV::VCTZ_V;
1065 break;
1066 case Intrinsic::ctlz:
1067 Op = RISCV::VCLZ_V;
1068 break;
1069 case Intrinsic::ctpop:
1070 Op = RISCV::VCPOP_V;
1071 break;
1073 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1075 break;
1077 case Intrinsic::abs: {
1078 auto LT = getTypeLegalizationCost(RetTy);
1079 if (ST->hasVInstructions() && LT.second.isVector()) {
1080 // vrsub.vi v10, v8, 0
1081 // vmax.vv v8, v8, v10
1082 return LT.first *
1083 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1084 LT.second, CostKind);
1086 break;
1088 case Intrinsic::get_active_lane_mask: {
1089 if (ST->hasVInstructions()) {
1090 Type *ExpRetTy = VectorType::get(
1091 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1092 auto LT = getTypeLegalizationCost(ExpRetTy);
1094 // vid.v v8 // considered hoisted
1095 // vsaddu.vx v8, v8, a0
1096 // vmsltu.vx v0, v8, a1
1097 return LT.first *
1098 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1099 LT.second, CostKind);
1101 break;
1103 // TODO: add more intrinsics
1104 case Intrinsic::stepvector: {
1105 auto LT = getTypeLegalizationCost(RetTy);
1106 // Legalisation of illegal types involves an `index' instruction plus
1107 // (LT.first - 1) vector adds.
1108 if (ST->hasVInstructions())
1109 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1110 (LT.first - 1) *
1111 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1112 return 1 + (LT.first - 1);
1114 case Intrinsic::experimental_cttz_elts: {
1115 Type *ArgTy = ICA.getArgTypes()[0];
1116 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1117 if (getTLI()->shouldExpandCttzElements(ArgType))
1118 break;
1119 InstructionCost Cost = getRISCVInstructionCost(
1120 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1122 // If zero_is_poison is false, then we will generate additional
1123 // cmp + select instructions to convert -1 to EVL.
1124 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1125 if (ICA.getArgs().size() > 1 &&
1126 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1127 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1128 CmpInst::ICMP_SLT, CostKind) +
1129 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1130 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1132 return Cost;
1134 case Intrinsic::vp_rint: {
1135 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1136 unsigned Cost = 5;
1137 auto LT = getTypeLegalizationCost(RetTy);
1138 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1139 return Cost * LT.first;
1140 break;
1142 case Intrinsic::vp_nearbyint: {
1143 // One more read and one write of fflags than vp_rint.
1144 unsigned Cost = 7;
1145 auto LT = getTypeLegalizationCost(RetTy);
1146 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1147 return Cost * LT.first;
1148 break;
1150 case Intrinsic::vp_ceil:
1151 case Intrinsic::vp_floor:
1152 case Intrinsic::vp_round:
1153 case Intrinsic::vp_roundeven:
1154 case Intrinsic::vp_roundtozero: {
1155 // Rounding with static rounding mode needs two more instructions to
1156 // swap/write FRM than vp_rint.
1157 unsigned Cost = 7;
1158 auto LT = getTypeLegalizationCost(RetTy);
1159 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1160 if (TLI->isOperationCustom(VPISD, LT.second))
1161 return Cost * LT.first;
1162 break;
1164 case Intrinsic::vp_fneg: {
1165 std::optional<unsigned> FOp =
1166 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1167 assert(FOp.has_value());
1168 return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
1169 break;
1171 case Intrinsic::vp_select: {
1172 Intrinsic::ID IID = ICA.getID();
1173 std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
1174 assert(FOp.has_value());
1175 return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1176 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1178 case Intrinsic::vp_merge:
1179 return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
1180 ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
1181 CostKind);
1182 case Intrinsic::experimental_vp_splat: {
1183 auto LT = getTypeLegalizationCost(RetTy);
1184 // TODO: Lower i1 experimental_vp_splat
1185 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1186 return InstructionCost::getInvalid();
1187 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1188 ? RISCV::VFMV_V_F
1189 : RISCV::VMV_V_X,
1190 LT.second, CostKind);
1194 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1195 if (auto LT = getTypeLegalizationCost(RetTy);
1196 LT.second.isVector()) {
1197 MVT EltTy = LT.second.getVectorElementType();
1198 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1199 ICA.getID(), EltTy))
1200 return LT.first * Entry->Cost;
1204 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1207 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1208 Type *Src,
1209 TTI::CastContextHint CCH,
1210 TTI::TargetCostKind CostKind,
1211 const Instruction *I) {
1212 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1213 if (!IsVectorType)
1214 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1216 // FIXME: Need to compute legalizing cost for illegal types. The current
1217 // code handles only legal types and those which can be trivially
1218 // promoted to legal.
1219 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1220 Dst->getScalarSizeInBits() > ST->getELen())
1221 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1223 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1224 assert(ISD && "Invalid opcode");
1225 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1226 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1228 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1229 // The shared implementation doesn't model vector widening during legalization
1230 // and instead assumes scalarization. In order to scalarize an <N x i1>
1231 // vector, we need to extend/trunc to/from i8. If we don't special case
1232 // this, we can get an infinite recursion cycle.
1233 switch (ISD) {
1234 default:
1235 break;
1236 case ISD::SIGN_EXTEND:
1237 case ISD::ZERO_EXTEND:
1238 if (Src->getScalarSizeInBits() == 1) {
1239 // We do not use vsext/vzext to extend from a mask vector.
1240 // Instead we use the following instructions to extend from a mask vector:
1241 // vmv.v.i v8, 0
1242 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1243 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1244 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1245 DstLT.second, CostKind) +
1246 DstLT.first - 1;
1248 break;
1249 case ISD::TRUNCATE:
1250 if (Dst->getScalarSizeInBits() == 1) {
1251 // We do not use a sequence of vncvt instructions to truncate to a mask
1252 // vector, so we cannot use PowDiff to calculate the cost.
1253 // Instead we use the following instructions to truncate to a mask vector:
1254 // vand.vi v8, v8, 1
1255 // vmsne.vi v0, v8, 0
1256 return SrcLT.first *
1257 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1258 SrcLT.second, CostKind) +
1259 SrcLT.first - 1;
1261 break;
1264 // Our actual lowering for the case where a wider legal type is available
1265 // uses promotion to the wider type. This is reflected in the result of
1266 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1267 // scalarized if the legalized Src and Dst are not equal sized.
1268 const DataLayout &DL = this->getDataLayout();
1269 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1270 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1271 SrcLT.second.getSizeInBits()) ||
1272 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1273 DstLT.second.getSizeInBits()))
1274 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1276 // The split cost is handled by the base getCastInstrCost
1277 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1279 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1280 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
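// e.g. an i8 -> i32 extend has PowDiff = 2 and maps to a single
// vsext.vf4/vzext.vf4 in the extend case below.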
1281 switch (ISD) {
1282 case ISD::SIGN_EXTEND:
1283 case ISD::ZERO_EXTEND: {
1284 if ((PowDiff < 1) || (PowDiff > 3))
1285 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1286 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1287 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1288 unsigned Op =
1289 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1290 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1292 case ISD::TRUNCATE:
1293 case ISD::FP_EXTEND:
1294 case ISD::FP_ROUND: {
1295 // Counts of narrow/widen instructions.
1296 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1297 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1299 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1300 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1301 : RISCV::VFNCVT_F_F_W;
1302 InstructionCost Cost = 0;
1303 for (; SrcEltSize != DstEltSize;) {
1304 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1305 ? MVT::getIntegerVT(DstEltSize)
1306 : MVT::getFloatingPointVT(DstEltSize);
1307 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1308 DstEltSize =
1309 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1310 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1312 return Cost;
1314 case ISD::FP_TO_SINT:
1315 case ISD::FP_TO_UINT: {
1316 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1317 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1318 unsigned FWCVT =
1319 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1320 unsigned FNCVT =
1321 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1322 unsigned SrcEltSize = Src->getScalarSizeInBits();
1323 unsigned DstEltSize = Dst->getScalarSizeInBits();
1324 InstructionCost Cost = 0;
1325 if ((SrcEltSize == 16) &&
1326 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1327 // If the target only supports zvfhmin, or this is an fp16-to-i64
1328 // conversion, pre-widen to f32 and then convert f32 to integer.
1329 VectorType *VecF32Ty =
1330 VectorType::get(Type::getFloatTy(Dst->getContext()),
1331 cast<VectorType>(Dst)->getElementCount());
1332 std::pair<InstructionCost, MVT> VecF32LT =
1333 getTypeLegalizationCost(VecF32Ty);
1334 Cost +=
1335 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1336 VecF32LT.second, CostKind);
1337 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1338 return Cost;
1340 if (DstEltSize == SrcEltSize)
1341 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1342 else if (DstEltSize > SrcEltSize)
1343 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1344 else { // (SrcEltSize > DstEltSize)
1345 // First do a narrowing conversion to an integer half the size, then
1346 // truncate if needed.
1347 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1348 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1349 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1350 if ((SrcEltSize / 2) > DstEltSize) {
1351 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1352 Cost +=
1353 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1356 return Cost;
1358 case ISD::SINT_TO_FP:
1359 case ISD::UINT_TO_FP: {
1360 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1361 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1362 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1363 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1364 unsigned SrcEltSize = Src->getScalarSizeInBits();
1365 unsigned DstEltSize = Dst->getScalarSizeInBits();
1367 InstructionCost Cost = 0;
1368 if ((DstEltSize == 16) &&
1369 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1370 // If the target only supports zvfhmin, or this is an i64-to-fp16
1371 // conversion, convert to f32 first and then convert f32 to f16.
1372 VectorType *VecF32Ty =
1373 VectorType::get(Type::getFloatTy(Dst->getContext()),
1374 cast<VectorType>(Dst)->getElementCount());
1375 std::pair<InstructionCost, MVT> VecF32LT =
1376 getTypeLegalizationCost(VecF32Ty);
1377 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1378 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1379 DstLT.second, CostKind);
1380 return Cost;
1383 if (DstEltSize == SrcEltSize)
1384 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1385 else if (DstEltSize > SrcEltSize) {
1386 if ((DstEltSize / 2) > SrcEltSize) {
1387 VectorType *VecTy =
1388 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1389 cast<VectorType>(Dst)->getElementCount());
1390 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1391 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1393 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1394 } else
1395 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1396 return Cost;
1399 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
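// Estimate the number of active lanes (VL) for a vector type: the exact
// element count for fixed-length vectors, or VLMAX derived from the tuning
// vscale for scalable vectors.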
1402 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1403 if (isa<ScalableVectorType>(Ty)) {
1404 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1405 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1406 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1407 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1409 return cast<FixedVectorType>(Ty)->getNumElements();
1412 InstructionCost
1413 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1414 FastMathFlags FMF,
1415 TTI::TargetCostKind CostKind) {
1416 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1417 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1419 // Skip if scalar size of Ty is bigger than ELEN.
1420 if (Ty->getScalarSizeInBits() > ST->getELen())
1421 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1423 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1424 if (Ty->getElementType()->isIntegerTy(1)) {
1425 // SelectionDAGBuilder does the following transforms:
1426 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1427 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1428 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1429 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1430 else
1431 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1434 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1435 SmallVector<unsigned, 3> Opcodes;
1436 InstructionCost ExtraCost = 0;
1437 switch (IID) {
1438 case Intrinsic::maximum:
1439 if (FMF.noNaNs()) {
1440 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1441 } else {
1442 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1443 RISCV::VFMV_F_S};
1444 // Cost of canonical NaN + branch
1445 // lui a0, 523264
1446 // fmv.w.x fa0, a0
1447 Type *DstTy = Ty->getScalarType();
1448 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1449 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1450 ExtraCost = 1 +
1451 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1452 TTI::CastContextHint::None, CostKind) +
1453 getCFInstrCost(Instruction::Br, CostKind);
1455 break;
1457 case Intrinsic::minimum:
1458 if (FMF.noNaNs()) {
1459 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1460 } else {
1461 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1462 RISCV::VFMV_F_S};
1463 // Cost of canonical NaN + branch
1464 // lui a0, 523264
1465 // fmv.w.x fa0, a0
1466 Type *DstTy = Ty->getScalarType();
1467 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1468 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1469 ExtraCost = 1 +
1470 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1471 TTI::CastContextHint::None, CostKind) +
1472 getCFInstrCost(Instruction::Br, CostKind);
1474 break;
1476 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1479 // An IR reduction is composed of one RVV reduction instruction and a vmv
1480 unsigned SplitOp;
1481 SmallVector<unsigned, 3> Opcodes;
1482 switch (IID) {
1483 default:
1484 llvm_unreachable("Unsupported intrinsic");
1485 case Intrinsic::smax:
1486 SplitOp = RISCV::VMAX_VV;
1487 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1488 break;
1489 case Intrinsic::smin:
1490 SplitOp = RISCV::VMIN_VV;
1491 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1492 break;
1493 case Intrinsic::umax:
1494 SplitOp = RISCV::VMAXU_VV;
1495 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1496 break;
1497 case Intrinsic::umin:
1498 SplitOp = RISCV::VMINU_VV;
1499 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1500 break;
1501 case Intrinsic::maxnum:
1502 SplitOp = RISCV::VFMAX_VV;
1503 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1504 break;
1505 case Intrinsic::minnum:
1506 SplitOp = RISCV::VFMIN_VV;
1507 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1508 break;
1510 // Add a cost for data larger than LMUL8
1511 InstructionCost SplitCost =
1512 (LT.first > 1) ? (LT.first - 1) *
1513 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1514 : 0;
1515 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1518 InstructionCost
1519 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1520 std::optional<FastMathFlags> FMF,
1521 TTI::TargetCostKind CostKind) {
1522 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1523 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1525 // Skip if scalar size of Ty is bigger than ELEN.
1526 if (Ty->getScalarSizeInBits() > ST->getELen())
1527 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1529 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1530 assert(ISD && "Invalid opcode");
1532 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1533 ISD != ISD::FADD)
1534 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1536 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1537 Type *ElementTy = Ty->getElementType();
1538 if (ElementTy->isIntegerTy(1)) {
1539 if (ISD == ISD::AND) {
1540 // Example sequences:
1541 // vsetvli a0, zero, e8, mf8, ta, ma
1542 // vmand.mm v8, v9, v8 ; needed every time type is split
1543 // vmnot.m v8, v0
1544 // vcpop.m a0, v8
1545 // seqz a0, a0
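// i.e. reduce_and over i1 is modeled as vcpop(~m) == 0.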
1546 return LT.first * getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second,
1547 CostKind) +
1548 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1549 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1550 CmpInst::ICMP_EQ, CostKind);
1551 } else if (ISD == ISD::XOR) {
1552 // Example sequences:
1553 // vsetvli a0, zero, e8, mf8, ta, ma
1554 // vmxor.mm v8, v0, v8 ; needed every time type is split
1555 // vcpop.m a0, v8
1556 // andi a0, a0, 1
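// i.e. reduce_xor over i1 is modeled as the parity of vcpop(m).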
1557 return (LT.first - 1) *
1558 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1559 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1560 } else {
1561 // Example sequences:
1562 // vsetvli a0, zero, e8, mf8, ta, ma
1563 // vmxor.mm v8, v9, v8 ; needed every time type is split
1564 // vcpop.m a0, v0
1565 // snez a0, a0
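// i.e. the remaining i1 cases (e.g. reduce_or) are modeled as vcpop(m) != 0.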
1566 return (LT.first - 1) *
1567 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1568 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1569 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1570 CmpInst::ICMP_NE, CostKind);
1571 }
1572 }
1574 // An IR reduction of or/and is composed of one vmv and one RVV reduction
1575 // instruction; the others are composed of two vmv and one RVV reduction
1576 // instruction.
1577 unsigned SplitOp;
1578 SmallVector<unsigned, 3> Opcodes;
1579 switch (ISD) {
1580 case ISD::ADD:
1581 SplitOp = RISCV::VADD_VV;
1582 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1583 break;
1584 case ISD::OR:
1585 SplitOp = RISCV::VOR_VV;
1586 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1587 break;
1588 case ISD::XOR:
1589 SplitOp = RISCV::VXOR_VV;
1590 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1591 break;
1592 case ISD::AND:
1593 SplitOp = RISCV::VAND_VV;
1594 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1595 break;
1596 case ISD::FADD:
1597 // We can't promote f16/bf16 fadd reductions.
1598 if ((LT.second.getVectorElementType() == MVT::f16 &&
1599 !ST->hasVInstructionsF16()) ||
1600 LT.second.getVectorElementType() == MVT::bf16)
1601 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1602 if (TTI::requiresOrderedReduction(FMF)) {
1603 Opcodes.push_back(RISCV::VFMV_S_F);
1604 for (unsigned i = 0; i < LT.first.getValue(); i++)
1605 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1606 Opcodes.push_back(RISCV::VFMV_F_S);
1607 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1608 }
1609 SplitOp = RISCV::VFADD_VV;
1610 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1611 break;
1612 }
1613 // Add a cost for data larger than LMUL8
1614 InstructionCost SplitCost =
1615 (LT.first > 1) ? (LT.first - 1) *
1616 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1617 : 0;
1618 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1619 }
1621 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1622 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1623 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1624 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1625 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1626 FMF, CostKind);
1628 // Skip if scalar size of ResTy is bigger than ELEN.
1629 if (ResTy->getScalarSizeInBits() > ST->getELen())
1630 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1631 FMF, CostKind);
1633 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1634 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1635 FMF, CostKind);
1637 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1639 if (IsUnsigned && Opcode == Instruction::Add &&
1640 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
1641 // Represent vector_reduce_add(ZExt(<n x i1>)) as
1642 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
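// E.g. vector_reduce_add(zext <8 x i1> %m to <8 x i32>) is costed as a
// single vcpop.m of the mask.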
1643 return LT.first *
1644 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
1645 }
1647 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1648 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1649 FMF, CostKind);
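// Otherwise the extend is expected to fold into a widening reduction
// (vwredsum[u].vs / vfwredusum.vs), so the cost is modeled as the plain
// reduction plus one extra op per additional register group.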
1651 return (LT.first - 1) +
1652 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1653 }
1655 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1656 TTI::OperandValueInfo OpInfo,
1657 TTI::TargetCostKind CostKind) {
1658 assert(OpInfo.isConstant() && "non constant operand?");
1659 if (!isa<VectorType>(Ty))
1660 // FIXME: We need to account for immediate materialization here, but doing
1661 // a decent job requires more knowledge about the immediate than we
1662 // currently have here.
1663 return 0;
1665 if (OpInfo.isUniform())
1666 // vmv.v.i, vmv.v.x, or vfmv.v.f
1667 // We ignore the cost of the scalar constant materialization to be consistent
1668 // with how we treat scalar constants themselves just above.
1669 return 1;
1671 return getConstantPoolLoadCost(Ty, CostKind);
1672 }
1675 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1676 MaybeAlign Alignment,
1677 unsigned AddressSpace,
1678 TTI::TargetCostKind CostKind,
1679 TTI::OperandValueInfo OpInfo,
1680 const Instruction *I) {
1681 EVT VT = TLI->getValueType(DL, Src, true);
1682 // Type legalization can't handle structs
1683 if (VT == MVT::Other)
1684 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1685 CostKind, OpInfo, I);
1687 InstructionCost Cost = 0;
1688 if (Opcode == Instruction::Store && OpInfo.isConstant())
1689 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1691 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1693 InstructionCost BaseCost = [&]() {
1694 InstructionCost Cost = LT.first;
1695 if (CostKind != TTI::TCK_RecipThroughput)
1696 return Cost;
1698 // Our actual lowering for the case where a wider legal type is available
1699 // uses a VL-predicated load on the wider type. This is reflected in
1700 // the result of getTypeLegalizationCost, but BasicTTI assumes the
1701 // widened cases are scalarized.
1702 const DataLayout &DL = this->getDataLayout();
1703 if (Src->isVectorTy() && LT.second.isVector() &&
1704 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1705 LT.second.getSizeInBits()))
1706 return Cost;
1708 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1709 CostKind, OpInfo, I);
1710 }();
1712 // Assume memory op costs scale with the number of vector registers
1713 // possibly accessed by the instruction. Note that BasicTTI already
1714 // handles the LT.first term for us.
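// E.g. an LMUL4 access is costed at roughly 4x an LMUL1 access for
// throughput and latency.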
1715 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1716 BaseCost *= TLI->getLMULCost(LT.second);
1717 return Cost + BaseCost;
1718 }
1721 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
1722 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1723 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
1724 TTI::OperandValueInfo Op2Info, const Instruction *I) {
1725 if (CostKind != TTI::TCK_RecipThroughput)
1726 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1727 Op1Info, Op2Info, I);
1729 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1730 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1731 Op1Info, Op2Info, I);
1733 // Skip if scalar size of ValTy is bigger than ELEN.
1734 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1735 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1736 Op1Info, Op2Info, I);
1738 auto GetConstantMatCost =
1739 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
1740 if (OpInfo.isUniform())
1741 // We return 0 because we currently ignore the cost of materializing scalar
1742 // constants in GPRs.
1743 return 0;
1745 return getConstantPoolLoadCost(ValTy, CostKind);
1746 };
1748 InstructionCost ConstantMatCost;
1749 if (Op1Info.isConstant())
1750 ConstantMatCost += GetConstantMatCost(Op1Info);
1751 if (Op2Info.isConstant())
1752 ConstantMatCost += GetConstantMatCost(Op2Info);
1754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1755 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1756 if (CondTy->isVectorTy()) {
1757 if (ValTy->getScalarSizeInBits() == 1) {
1758 // vmandn.mm v8, v8, v9
1759 // vmand.mm v9, v0, v9
1760 // vmor.mm v0, v9, v8
1761 return ConstantMatCost +
1762 LT.first *
1763 getRISCVInstructionCost(
1764 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1765 LT.second, CostKind);
1766 }
1767 // vselect and max/min are supported natively.
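// vmerge.vvm v8, v9, v8, v0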
1768 return ConstantMatCost +
1769 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
1770 CostKind);
1771 }
1773 if (ValTy->getScalarSizeInBits() == 1) {
1774 // vmv.v.x v9, a0
1775 // vmsne.vi v9, v9, 0
1776 // vmandn.mm v8, v8, v9
1777 // vmand.mm v9, v0, v9
1778 // vmor.mm v0, v9, v8
1779 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1780 return ConstantMatCost +
1781 LT.first *
1782 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1783 InterimVT, CostKind) +
1784 LT.first * getRISCVInstructionCost(
1785 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1786 LT.second, CostKind);
1787 }
1789 // vmv.v.x v10, a0
1790 // vmsne.vi v0, v10, 0
1791 // vmerge.vvm v8, v9, v8, v0
1792 return ConstantMatCost +
1793 LT.first * getRISCVInstructionCost(
1794 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1795 LT.second, CostKind);
1796 }
1798 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1799 CmpInst::isIntPredicate(VecPred)) {
1800 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1801 // provided they incur the same cost across all implementations
1802 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
1803 LT.second,
1804 CostKind);
1805 }
1807 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1808 CmpInst::isFPPredicate(VecPred)) {
1810 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1811 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1812 return ConstantMatCost +
1813 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1815 // If we do not support the input floating point vector type, use the base
1816 // one which will calculate as:
1817 // ScalarizeCost + Num * Cost for fixed vector,
1818 // InvalidCost for scalable vector.
1819 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1820 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1821 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1822 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1823 Op1Info, Op2Info, I);
1825 // Assuming vector fp compare and mask instructions are all the same cost
1826 // until a need arises to differentiate them.
1827 switch (VecPred) {
1828 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1829 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1830 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1831 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1832 return ConstantMatCost +
1833 LT.first * getRISCVInstructionCost(
1834 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1835 LT.second, CostKind);
1837 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1838 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1839 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1840 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1841 return ConstantMatCost +
1842 LT.first *
1843 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1844 LT.second, CostKind);
1846 case CmpInst::FCMP_OEQ: // vmfeq.vv
1847 case CmpInst::FCMP_OGT: // vmflt.vv
1848 case CmpInst::FCMP_OGE: // vmfle.vv
1849 case CmpInst::FCMP_OLT: // vmflt.vv
1850 case CmpInst::FCMP_OLE: // vmfle.vv
1851 case CmpInst::FCMP_UNE: // vmfne.vv
1852 return ConstantMatCost +
1853 LT.first *
1854 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1855 default:
1856 break;
1857 }
1858 }
1860 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
1861 // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
1862 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
1863 // be (0 + select instr cost).
1864 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
1865 ValTy->isIntegerTy() && !I->user_empty()) {
1866 if (all_of(I->users(), [&](const User *U) {
1867 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
1868 U->getType()->isIntegerTy() &&
1869 !isa<ConstantData>(U->getOperand(1)) &&
1870 !isa<ConstantData>(U->getOperand(2));
1871 }))
1872 return 0;
1873 }
1875 // TODO: Add cost for scalar type.
1877 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1878 Op1Info, Op2Info, I);
1879 }
1881 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1882 TTI::TargetCostKind CostKind,
1883 const Instruction *I) {
1884 if (CostKind != TTI::TCK_RecipThroughput)
1885 return Opcode == Instruction::PHI ? 0 : 1;
1886 // Branches are assumed to be predicted.
1887 return 0;
1888 }
1890 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1891 TTI::TargetCostKind CostKind,
1892 unsigned Index, Value *Op0,
1893 Value *Op1) {
1894 assert(Val->isVectorTy() && "This must be a vector type");
1896 if (Opcode != Instruction::ExtractElement &&
1897 Opcode != Instruction::InsertElement)
1898 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1900 // Legalize the type.
1901 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1903 // This type is legalized to a scalar type.
1904 if (!LT.second.isVector()) {
1905 auto *FixedVecTy = cast<FixedVectorType>(Val);
1906 // If Index is a known constant, cost is zero.
1907 if (Index != -1U)
1908 return 0;
1909 // Extract/InsertElement with non-constant index is very costly when
1910 // scalarized; estimate cost of loads/stores sequence via the stack:
1911 // ExtractElement cost: store vector to stack, load scalar;
1912 // InsertElement cost: store vector to stack, store scalar, load vector.
1913 Type *ElemTy = FixedVecTy->getElementType();
1914 auto NumElems = FixedVecTy->getNumElements();
1915 auto Align = DL.getPrefTypeAlign(ElemTy);
1916 InstructionCost LoadCost =
1917 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1918 InstructionCost StoreCost =
1919 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1920 return Opcode == Instruction::ExtractElement
1921 ? StoreCost * NumElems + LoadCost
1922 : (StoreCost + LoadCost) * NumElems + StoreCost;
1923 }
1925 // For unsupported scalable vector.
1926 if (LT.second.isScalableVector() && !LT.first.isValid())
1927 return LT.first;
1929 // Mask vector extract/insert is expanded via e8.
1930 if (Val->getScalarSizeInBits() == 1) {
1931 VectorType *WideTy =
1932 VectorType::get(IntegerType::get(Val->getContext(), 8),
1933 cast<VectorType>(Val)->getElementCount());
1934 if (Opcode == Instruction::ExtractElement) {
1935 InstructionCost ExtendCost
1936 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1937 TTI::CastContextHint::None, CostKind);
1938 InstructionCost ExtractCost
1939 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1940 return ExtendCost + ExtractCost;
1941 }
1942 InstructionCost ExtendCost
1943 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1944 TTI::CastContextHint::None, CostKind);
1945 InstructionCost InsertCost
1946 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1947 InstructionCost TruncCost
1948 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1949 TTI::CastContextHint::None, CostKind);
1950 return ExtendCost + InsertCost + TruncCost;
1951 }
1954 // In RVV, we can use vslidedown + vmv.x.s to extract an element from a vector
1955 // and vslideup + vmv.s.x to insert an element into a vector.
1956 unsigned BaseCost = 1;
1957 // For insertelement an extra addi is needed to compute Index + 1 for the VL of the vslideup.
1958 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
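// E.g. under this model, extracting at a non-zero constant index is
// vslidedown.vi + vmv.x.s (cost 2), while extracting element 0 is just
// vmv.x.s (cost 1).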
1960 if (Index != -1U) {
1961 // The type may be split. For fixed-width vectors we can normalize the
1962 // index to the new type.
1963 if (LT.second.isFixedLengthVector()) {
1964 unsigned Width = LT.second.getVectorNumElements();
1965 Index = Index % Width;
1966 }
1968 // If exact VLEN is known, we will insert/extract into the appropriate
1969 // subvector with no additional subvector insert/extract cost.
1970 if (auto VLEN = ST->getRealVLen()) {
1971 unsigned EltSize = LT.second.getScalarSizeInBits();
1972 unsigned M1Max = *VLEN / EltSize;
1973 Index = Index % M1Max;
1974 }
1976 // We could extract/insert the first element without vslidedown/vslideup.
1977 if (Index == 0)
1978 SlideCost = 0;
1979 else if (Opcode == Instruction::InsertElement)
1980 SlideCost = 1; // With a constant index, we do not need to use addi.
1981 }
1983 // When the vector needs to be split into multiple register groups and the
1984 // index exceeds a single vector register group, we need to insert/extract the
1985 // element via the stack.
1986 if (LT.first > 1 &&
1987 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
1988 LT.second.isScalableVector()))) {
1989 Type *ScalarType = Val->getScalarType();
1990 Align VecAlign = DL.getPrefTypeAlign(Val);
1991 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
1992 // Extra addi for unknown index.
1993 InstructionCost IdxCost = Index == -1U ? 1 : 0;
1995 // Store all split vectors into stack and load the target element.
1996 if (Opcode == Instruction::ExtractElement)
1997 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
1998 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
1999 CostKind) +
2000 IdxCost;
2002 // Store all split vectors into stack and store the target element and load
2003 // vectors back.
2004 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2005 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2006 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2007 CostKind) +
2008 IdxCost;
2009 }
2011 // Extracting or inserting an i64 element on a target with XLEN=32 needs more instructions.
2012 if (Val->getScalarType()->isIntegerTy() &&
2013 ST->getXLen() < Val->getScalarSizeInBits()) {
2014 // For extractelement, we need the following instructions:
2015 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2016 // vslidedown.vx v8, v8, a0
2017 // vmv.x.s a0, v8
2018 // li a1, 32
2019 // vsrl.vx v8, v8, a1
2020 // vmv.x.s a1, v8
2022 // For insertelement, we need the following instructions:
2023 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2024 // vmv.v.i v12, 0
2025 // vslide1up.vx v16, v12, a1
2026 // vslide1up.vx v12, v16, a0
2027 // addi a0, a2, 1
2028 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2029 // vslideup.vx v8, v12, a2
2031 // TODO: should we count these special vsetvlis?
2032 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2033 }
2034 return BaseCost + SlideCost;
2035 }
2037 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2038 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2039 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2040 ArrayRef<const Value *> Args, const Instruction *CxtI) {
2042 // TODO: Handle more cost kinds.
2043 if (CostKind != TTI::TCK_RecipThroughput)
2044 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2045 Args, CxtI);
2047 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2048 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2049 Args, CxtI);
2051 // Skip if scalar size of Ty is bigger than ELEN.
2052 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2053 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2054 Args, CxtI);
2056 // Legalize the type.
2057 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2059 // TODO: Handle scalar type.
2060 if (!LT.second.isVector())
2061 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2062 Args, CxtI);
2064 // f16 with zvfhmin and bf16 will be promoted to f32.
2065 // FIXME: nxv32[b]f16 will be custom lowered and split.
2066 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2067 InstructionCost CastCost = 0;
2068 if ((LT.second.getVectorElementType() == MVT::f16 ||
2069 LT.second.getVectorElementType() == MVT::bf16) &&
2070 TLI->getOperationAction(ISDOpcode, LT.second) ==
2071 TargetLoweringBase::LegalizeAction::Promote) {
2072 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2073 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2074 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2075 // Add cost of extending arguments
2076 CastCost += LT.first * Args.size() *
2077 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2078 TTI::CastContextHint::None, CostKind);
2079 // Add cost of truncating result
2080 CastCost +=
2081 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2082 TTI::CastContextHint::None, CostKind);
2083 // Compute cost of op in promoted type
2084 LT.second = PromotedVT;
2085 }
2087 auto getConstantMatCost =
2088 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2089 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2090 // Two sub-cases:
2091 // * Has a 5 bit immediate operand which can be splatted.
2092 // * Has a larger immediate which must be materialized in scalar register
2093 // We return 0 for both as we currently ignore the cost of materializing
2094 // scalar constants in GPRs.
2095 return 0;
2097 return getConstantPoolLoadCost(Ty, CostKind);
2098 };
2100 // Add the cost of materializing any constant vectors required.
2101 InstructionCost ConstantMatCost = 0;
2102 if (Op1Info.isConstant())
2103 ConstantMatCost += getConstantMatCost(0, Op1Info);
2104 if (Op2Info.isConstant())
2105 ConstantMatCost += getConstantMatCost(1, Op2Info);
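// The switch below maps the IR opcode to one representative RVV instruction;
// opcodes in the same class (e.g. add/sub, all the shifts) are assumed to
// have the same cost.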
2107 unsigned Op;
2108 switch (ISDOpcode) {
2109 case ISD::ADD:
2110 case ISD::SUB:
2111 Op = RISCV::VADD_VV;
2112 break;
2113 case ISD::SHL:
2114 case ISD::SRL:
2115 case ISD::SRA:
2116 Op = RISCV::VSLL_VV;
2117 break;
2118 case ISD::AND:
2119 case ISD::OR:
2120 case ISD::XOR:
2121 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2122 break;
2123 case ISD::MUL:
2124 case ISD::MULHS:
2125 case ISD::MULHU:
2126 Op = RISCV::VMUL_VV;
2127 break;
2128 case ISD::SDIV:
2129 case ISD::UDIV:
2130 Op = RISCV::VDIV_VV;
2131 break;
2132 case ISD::SREM:
2133 case ISD::UREM:
2134 Op = RISCV::VREM_VV;
2135 break;
2136 case ISD::FADD:
2137 case ISD::FSUB:
2138 Op = RISCV::VFADD_VV;
2139 break;
2140 case ISD::FMUL:
2141 Op = RISCV::VFMUL_VV;
2142 break;
2143 case ISD::FDIV:
2144 Op = RISCV::VFDIV_VV;
2145 break;
2146 case ISD::FNEG:
2147 Op = RISCV::VFSGNJN_VV;
2148 break;
2149 default:
2150 // Assuming all other instructions have the same cost until a need arises to
2151 // differentiate them.
2152 return CastCost + ConstantMatCost +
2153 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2154 Args, CxtI);
2155 }
2157 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2158 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2159 // ops are twice as expensive as integer ops. Do the same for vectors so
2160 // scalar floating point ops aren't cheaper than their vector equivalents.
2161 if (Ty->isFPOrFPVectorTy())
2162 InstrCost *= 2;
2163 return CastCost + ConstantMatCost + LT.first * InstrCost;
2164 }
2166 // TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2167 InstructionCost RISCVTTIImpl::getPointersChainCost(
2168 ArrayRef<const Value *> Ptrs, const Value *Base,
2169 const TTI::PointersChainInfo &Info, Type *AccessTy,
2170 TTI::TargetCostKind CostKind) {
2171 InstructionCost Cost = TTI::TCC_Free;
2172 // In the basic model we take into account GEP instructions only
2173 // (although an alloca instruction, a value, constants and/or constant
2174 // expressions, PHIs, bitcasts ... whatever is allowed to be used as a
2175 // pointer can appear here). Typically, if Base is not a GEP instruction and
2176 // all the pointers are relative to the same base address, all the rest are
2177 // either GEP instructions, PHIs, bitcasts or constants. When we have the
2178 // same base, we just calculate the cost of each non-Base GEP as an ADD
2179 // operation if any of its indices is a non-constant.
2180 // If there are no known dependencies between the pointers, the cost is
2181 // calculated as a sum of the costs of the GEP instructions.
2182 for (auto [I, V] : enumerate(Ptrs)) {
2183 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2184 if (!GEP)
2185 continue;
2186 if (Info.isSameBase() && V != Base) {
2187 if (GEP->hasAllConstantIndices())
2188 continue;
2189 // If the chain is unit-stride and BaseReg + stride*i is a legal
2190 // addressing mode, then presume the base GEP is sitting around in a
2191 // register somewhere and check if we can fold the offset relative to
2192 // it.
2193 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2194 if (Info.isUnitStride() &&
2195 isLegalAddressingMode(AccessTy,
2196 /* BaseGV */ nullptr,
2197 /* BaseOffset */ Stride * I,
2198 /* HasBaseReg */ true,
2199 /* Scale */ 0,
2200 GEP->getType()->getPointerAddressSpace()))
2201 continue;
2202 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2203 {TTI::OK_AnyValue, TTI::OP_None},
2204 {TTI::OK_AnyValue, TTI::OP_None}, {});
2205 } else {
2206 SmallVector<const Value *> Indices(GEP->indices());
2207 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2208 Indices, AccessTy, CostKind);
2209 }
2210 }
2211 return Cost;
2212 }
2214 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2215 TTI::UnrollingPreferences &UP,
2216 OptimizationRemarkEmitter *ORE) {
2217 // TODO: More tuning on benchmarks and metrics with changes as needed
2218 // would apply to all settings below to enable performance.
2221 if (ST->enableDefaultUnroll())
2222 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2224 // Enable Upper bound unrolling universally, not dependent upon the conditions
2225 // below.
2226 UP.UpperBound = true;
2228 // Disable loop unrolling for Oz and Os.
2229 UP.OptSizeThreshold = 0;
2230 UP.PartialOptSizeThreshold = 0;
2231 if (L->getHeader()->getParent()->hasOptSize())
2232 return;
2234 SmallVector<BasicBlock *, 4> ExitingBlocks;
2235 L->getExitingBlocks(ExitingBlocks);
2236 LLVM_DEBUG(dbgs() << "Loop has:\n"
2237 << "Blocks: " << L->getNumBlocks() << "\n"
2238 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2240 // Allow at most one exit other than the latch. This acts as an early exit
2241 // as it mirrors the profitability calculation of the runtime unroller.
2242 if (ExitingBlocks.size() > 2)
2243 return;
2245 // Limit the CFG of the loop body for targets with a branch predictor.
2246 // Allowing 4 blocks permits if-then-else diamonds in the body.
2247 if (L->getNumBlocks() > 4)
2248 return;
2250 // Don't unroll vectorized loops, including the remainder loop
2251 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2252 return;
2254 // Scan the loop: don't unroll loops with calls as this could prevent
2255 // inlining.
2256 InstructionCost Cost = 0;
2257 for (auto *BB : L->getBlocks()) {
2258 for (auto &I : *BB) {
2259 // Initial setting - Don't unroll loops containing vectorized
2260 // instructions.
2261 if (I.getType()->isVectorTy())
2262 return;
2264 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2265 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2266 if (!isLoweredToCall(F))
2267 continue;
2268 }
2269 return;
2270 }
2272 SmallVector<const Value *> Operands(I.operand_values());
2273 Cost += getInstructionCost(&I, Operands,
2274 TargetTransformInfo::TCK_SizeAndLatency);
2275 }
2276 }
2278 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2280 UP.Partial = true;
2281 UP.Runtime = true;
2282 UP.UnrollRemainder = true;
2283 UP.UnrollAndJam = true;
2285 // Forcing unrolling of small loops can be very useful because of the
2286 // branch-taken cost of the backedge.
2287 if (Cost < 12)
2288 UP.Force = true;
2289 }
2291 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2292 TTI::PeelingPreferences &PP) {
2293 BaseT::getPeelingPreferences(L, SE, PP);
2294 }
2296 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
2297 if (Ty->isVectorTy()) {
2298 // f16 with only zvfhmin and bf16 will be promoted to f32
2299 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2300 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2301 EltTy->isBFloatTy())
2302 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
2303 cast<VectorType>(Ty));
2305 TypeSize Size = DL.getTypeSizeInBits(Ty);
2306 if (Size.isScalable() && ST->hasVInstructions())
2307 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2309 if (ST->useRVVForFixedLengthVectors())
2310 return divideCeil(Size, ST->getRealMinVLen());
2311 }
2313 return BaseT::getRegUsageForType(Ty);
2314 }
2316 unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2317 if (SLPMaxVF.getNumOccurrences())
2318 return SLPMaxVF;
2320 // Return how many elements can fit in getRegisterBitWidth. This is the
2321 // same routine as used in LoopVectorizer. We should probably be
2322 // accounting for whether we actually have instructions with the right
2323 // lane type, but we don't have enough information to do that without
2324 // some additional plumbing which hasn't been justified yet.
2325 TypeSize RegWidth =
2326 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
2327 // If no vector registers, or absurd element widths, disable
2328 // vectorization by returning 1.
2329 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2330 }
2332 TTI::AddressingModeKind
2333 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
2334 ScalarEvolution *SE) const {
2335 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2336 return TTI::AMK_PostIndexed;
2338 return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
2339 }
2341 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
2342 const TargetTransformInfo::LSRCost &C2) {
2343 // The RISC-V-specific rule here is "instruction number 1st priority".
2344 // If we need to emit adds inside the loop to add up base registers, then
2345 // we need at least one extra temporary register.
2346 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2347 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2348 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2349 C1.NumIVMuls, C1.NumBaseAdds,
2350 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2351 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2352 C2.NumIVMuls, C2.NumBaseAdds,
2353 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2354 }
2356 bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
2357 auto *VTy = dyn_cast<VectorType>(DataTy);
2358 if (!VTy || VTy->isScalableTy())
2359 return false;
2361 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2362 return false;
2364 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2365 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2366 if (VTy->getElementType()->isIntegerTy(8))
2367 if (VTy->getElementCount().getFixedValue() > 256)
2368 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2369 ST->getMaxLMULForFixedLengthVectors();
2370 return true;
2371 }
2373 bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
2374 auto *VTy = dyn_cast<VectorType>(DataTy);
2375 if (!VTy || VTy->isScalableTy())
2376 return false;
2378 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2379 return false;
2380 return true;
2381 }
2383 /// See if \p I should be considered for address type promotion. We check if \p
2384 /// I is a sext with the right type and used in memory accesses. If it is used in a
2385 /// "complex" getelementptr, we allow it to be promoted without finding other
2386 /// sext instructions that sign extended the same initial value. A getelementptr
2387 /// is considered as "complex" if it has more than 2 operands.
2388 bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
2389 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2390 bool Considerable = false;
2391 AllowPromotionWithoutCommonHeader = false;
2392 if (!isa<SExtInst>(&I))
2393 return false;
2394 Type *ConsideredSExtType =
2395 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2396 if (I.getType() != ConsideredSExtType)
2397 return false;
2398 // See if the sext is the one with the right type and used in at least one
2399 // GetElementPtrInst.
2400 for (const User *U : I.users()) {
2401 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2402 Considerable = true;
2403 // A getelementptr is considered as "complex" if it has more than 2
2404 // operands. We will promote a SExt used in such complex GEP as we
2405 // expect some computation to be merged if they are done on 64 bits.
2406 if (GEPInst->getNumOperands() > 2) {
2407 AllowPromotionWithoutCommonHeader = true;
2408 break;
2409 }
2410 }
2411 }
2412 return Considerable;
2413 }
2415 bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2416 switch (Opcode) {
2417 case Instruction::Add:
2418 case Instruction::Sub:
2419 case Instruction::Mul:
2420 case Instruction::And:
2421 case Instruction::Or:
2422 case Instruction::Xor:
2423 case Instruction::FAdd:
2424 case Instruction::FSub:
2425 case Instruction::FMul:
2426 case Instruction::FDiv:
2427 case Instruction::ICmp:
2428 case Instruction::FCmp:
2429 return true;
2430 case Instruction::Shl:
2431 case Instruction::LShr:
2432 case Instruction::AShr:
2433 case Instruction::UDiv:
2434 case Instruction::SDiv:
2435 case Instruction::URem:
2436 case Instruction::SRem:
2437 case Instruction::Select:
2438 return Operand == 1;
2439 default:
2440 return false;
2441 }
2442 }
2444 bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
2445 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2446 return false;
2448 if (canSplatOperand(I->getOpcode(), Operand))
2449 return true;
2451 auto *II = dyn_cast<IntrinsicInst>(I);
2452 if (!II)
2453 return false;
2455 switch (II->getIntrinsicID()) {
2456 case Intrinsic::fma:
2457 case Intrinsic::vp_fma:
2458 case Intrinsic::fmuladd:
2459 case Intrinsic::vp_fmuladd:
2460 return Operand == 0 || Operand == 1;
2461 case Intrinsic::vp_shl:
2462 case Intrinsic::vp_lshr:
2463 case Intrinsic::vp_ashr:
2464 case Intrinsic::vp_udiv:
2465 case Intrinsic::vp_sdiv:
2466 case Intrinsic::vp_urem:
2467 case Intrinsic::vp_srem:
2468 case Intrinsic::ssub_sat:
2469 case Intrinsic::vp_ssub_sat:
2470 case Intrinsic::usub_sat:
2471 case Intrinsic::vp_usub_sat:
2472 case Intrinsic::vp_select:
2473 return Operand == 1;
2474 // These intrinsics are commutative.
2475 case Intrinsic::vp_add:
2476 case Intrinsic::vp_mul:
2477 case Intrinsic::vp_and:
2478 case Intrinsic::vp_or:
2479 case Intrinsic::vp_xor:
2480 case Intrinsic::vp_fadd:
2481 case Intrinsic::vp_fmul:
2482 case Intrinsic::vp_icmp:
2483 case Intrinsic::vp_fcmp:
2484 case Intrinsic::smin:
2485 case Intrinsic::vp_smin:
2486 case Intrinsic::umin:
2487 case Intrinsic::vp_umin:
2488 case Intrinsic::smax:
2489 case Intrinsic::vp_smax:
2490 case Intrinsic::umax:
2491 case Intrinsic::vp_umax:
2492 case Intrinsic::sadd_sat:
2493 case Intrinsic::vp_sadd_sat:
2494 case Intrinsic::uadd_sat:
2495 case Intrinsic::vp_uadd_sat:
2496 // These intrinsics have 'vr' versions.
2497 case Intrinsic::vp_sub:
2498 case Intrinsic::vp_fsub:
2499 case Intrinsic::vp_fdiv:
2500 return Operand == 0 || Operand == 1;
2501 default:
2502 return false;
2503 }
2504 }
2506 /// Check if sinking \p I's operands to I's basic block is profitable, because
2507 /// the operands can be folded into a target instruction, e.g.
2508 /// splats of scalars can fold into vector instructions.
2509 bool RISCVTTIImpl::isProfitableToSinkOperands(
2510 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
2511 using namespace llvm::PatternMatch;
2513 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2514 return false;
2516 // Don't sink splat operands if the target prefers not to. Some targets require
2517 // S2V transfer buffers and we can run out of them copying the same value
2518 // repeatedly.
2519 // FIXME: It could still be worth doing if it would improve vector register
2520 // pressure and prevent a vector spill.
2521 if (!ST->sinkSplatOperands())
2522 return false;
2524 for (auto OpIdx : enumerate(I->operands())) {
2525 if (!canSplatOperand(I, OpIdx.index()))
2526 continue;
2528 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2529 // Make sure we are not already sinking this operand
2530 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2531 continue;
2533 // We are looking for a splat that can be sunk.
2534 if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
2535 m_Undef(), m_ZeroMask())))
2536 continue;
2538 // Don't sink i1 splats.
2539 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2540 continue;
2542 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2543 // and vector registers
2544 for (Use &U : Op->uses()) {
2545 Instruction *Insn = cast<Instruction>(U.getUser());
2546 if (!canSplatOperand(Insn, U.getOperandNo()))
2547 return false;
2548 }
2550 Ops.push_back(&Op->getOperandUse(0));
2551 Ops.push_back(&OpIdx.value());
2552 }
2553 return true;
2554 }
2556 RISCVTTIImpl::TTI::MemCmpExpansionOptions
2557 RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2558 TTI::MemCmpExpansionOptions Options;
2559 // TODO: Enable expansion when unaligned access is not supported after we fix
2560 // issues in ExpandMemcmp.
2561 if (!ST->enableUnalignedScalarMem())
2562 return Options;
2564 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
2565 return Options;
2567 Options.AllowOverlappingLoads = true;
2568 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2569 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2570 if (ST->is64Bit()) {
2571 Options.LoadSizes = {8, 4, 2, 1};
2572 Options.AllowedTailExpansions = {3, 5, 6};
2573 } else {
2574 Options.LoadSizes = {4, 2, 1};
2575 Options.AllowedTailExpansions = {3};
2576 }
2577 return Options;