llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
1 //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "RISCVTargetTransformInfo.h"
10 #include "MCTargetDesc/RISCVMatInt.h"
11 #include "llvm/ADT/STLExtras.h"
12 #include "llvm/Analysis/TargetTransformInfo.h"
13 #include "llvm/CodeGen/BasicTTIImpl.h"
14 #include "llvm/CodeGen/CostTable.h"
15 #include "llvm/CodeGen/TargetLowering.h"
16 #include "llvm/IR/Instructions.h"
17 #include "llvm/IR/PatternMatch.h"
18 #include <cmath>
19 #include <optional>
20 using namespace llvm;
21 using namespace llvm::PatternMatch;
23 #define DEBUG_TYPE "riscvtti"
25 static cl::opt<unsigned> RVVRegisterWidthLMUL(
26 "riscv-v-register-bit-width-lmul",
27 cl::desc(
28 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
29 "by autovectorized code. Fractional LMULs are not supported."),
30 cl::init(2), cl::Hidden);
32 static cl::opt<unsigned> SLPMaxVF(
33 "riscv-v-slp-max-vf",
34 cl::desc(
35 "Overrides result used for getMaximumVF query which is used "
36 "exclusively by SLP vectorizer."),
37 cl::Hidden);
39 InstructionCost
40 RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
41 TTI::TargetCostKind CostKind) {
42 // Check if the type is valid for all CostKinds.
43 if (!VT.isVector())
44 return InstructionCost::getInvalid();
45 size_t NumInstr = OpCodes.size();
46 if (CostKind == TTI::TCK_CodeSize)
47 return NumInstr;
48 InstructionCost LMULCost = TLI->getLMULCost(VT);
49 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
50 return LMULCost * NumInstr;
51 InstructionCost Cost = 0;
52 for (auto Op : OpCodes) {
53 switch (Op) {
54 case RISCV::VRGATHER_VI:
55 Cost += TLI->getVRGatherVICost(VT);
56 break;
57 case RISCV::VRGATHER_VV:
58 Cost += TLI->getVRGatherVVCost(VT);
59 break;
60 case RISCV::VSLIDEUP_VI:
61 case RISCV::VSLIDEDOWN_VI:
62 Cost += TLI->getVSlideVICost(VT);
63 break;
64 case RISCV::VSLIDEUP_VX:
65 case RISCV::VSLIDEDOWN_VX:
66 Cost += TLI->getVSlideVXCost(VT);
67 break;
68 case RISCV::VREDMAX_VS:
69 case RISCV::VREDMIN_VS:
70 case RISCV::VREDMAXU_VS:
71 case RISCV::VREDMINU_VS:
72 case RISCV::VREDSUM_VS:
73 case RISCV::VREDAND_VS:
74 case RISCV::VREDOR_VS:
75 case RISCV::VREDXOR_VS:
76 case RISCV::VFREDMAX_VS:
77 case RISCV::VFREDMIN_VS:
78 case RISCV::VFREDUSUM_VS: {
79 unsigned VL = VT.getVectorMinNumElements();
80 if (!VT.isFixedLengthVector())
81 VL *= *getVScaleForTuning();
82 Cost += Log2_32_Ceil(VL);
83 break;
85 case RISCV::VFREDOSUM_VS: {
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += VL;
90 break;
92 case RISCV::VMV_X_S:
93 case RISCV::VMV_S_X:
94 case RISCV::VFMV_F_S:
95 case RISCV::VFMV_S_F:
96 case RISCV::VMOR_MM:
97 case RISCV::VMXOR_MM:
98 case RISCV::VMAND_MM:
99 case RISCV::VMANDN_MM:
100 case RISCV::VMNAND_MM:
101 case RISCV::VCPOP_M:
102 case RISCV::VFIRST_M:
103 Cost += 1;
104 break;
105 default:
106 Cost += LMULCost;
109 return Cost;
112 static InstructionCost getIntImmCostImpl(const DataLayout &DL,
113 const RISCVSubtarget *ST,
114 const APInt &Imm, Type *Ty,
115 TTI::TargetCostKind CostKind,
116 bool FreeZeroes) {
117 assert(Ty->isIntegerTy() &&
118 "getIntImmCost can only estimate cost of materialising integers");
120 // We have a Zero register, so 0 is always free.
121 if (Imm == 0)
122 return TTI::TCC_Free;
124 // Otherwise, we check how many instructions it will take to materialise.
125 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
126 /*CompressionCost=*/false, FreeZeroes);
129 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
130 TTI::TargetCostKind CostKind) {
131 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
134 // Look for patterns of shift followed by AND that can be turned into a pair of
135 // shifts. We won't need to materialize an immediate for the AND so these can
136 // be considered free.
137 static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
138 uint64_t Mask = Imm.getZExtValue();
139 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
140 if (!BO || !BO->hasOneUse())
141 return false;
143 if (BO->getOpcode() != Instruction::Shl)
144 return false;
146 if (!isa<ConstantInt>(BO->getOperand(1)))
147 return false;
149 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
150 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
151 // is a mask shifted by c2 bits with c3 leading zeros.
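// For example, with Mask = 0xf0 and ShAmt = 4 (XLEN = 64), the pattern
// (and (shl x, 4), 0xf0) keeps bits [3:0] of x and can be rewritten as
// (srli (slli x, 60), 56), so no immediate needs to be materialised.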
152 if (isShiftedMask_64(Mask)) {
153 unsigned Trailing = llvm::countr_zero(Mask);
154 if (ShAmt == Trailing)
155 return true;
158 return false;
161 InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
162 const APInt &Imm, Type *Ty,
163 TTI::TargetCostKind CostKind,
164 Instruction *Inst) {
165 assert(Ty->isIntegerTy() &&
166 "getIntImmCost can only estimate cost of materialising integers");
168 // We have a Zero register, so 0 is always free.
169 if (Imm == 0)
170 return TTI::TCC_Free;
172 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
173 // commutative; in others the immediate must come from a specific argument index.
174 bool Takes12BitImm = false;
175 unsigned ImmArgIdx = ~0U;
177 switch (Opcode) {
178 case Instruction::GetElementPtr:
179 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
180 // split up large offsets in GEP into better parts than ConstantHoisting
181 // can.
182 return TTI::TCC_Free;
183 case Instruction::Store: {
184 // Use the materialization cost regardless of whether it's the address or the
185 // value that is constant, except when the store is misaligned and
186 // misaligned accesses are not legal (experience shows constant hoisting
187 // can sometimes be harmful in such cases).
188 if (Idx == 1 || !Inst)
189 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
190 /*FreeZeroes=*/true);
192 StoreInst *ST = cast<StoreInst>(Inst);
193 if (!getTLI()->allowsMemoryAccessForAlignment(
194 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
195 ST->getPointerAddressSpace(), ST->getAlign()))
196 return TTI::TCC_Free;
198 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
199 /*FreeZeroes=*/true);
201 case Instruction::Load:
202 // If the address is a constant, use the materialization cost.
203 return getIntImmCost(Imm, Ty, CostKind);
204 case Instruction::And:
205 // zext.h
206 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
207 return TTI::TCC_Free;
208 // zext.w
209 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
210 return TTI::TCC_Free;
211 // bclri
212 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
213 return TTI::TCC_Free;
214 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
215 canUseShiftPair(Inst, Imm))
216 return TTI::TCC_Free;
217 Takes12BitImm = true;
218 break;
219 case Instruction::Add:
220 Takes12BitImm = true;
221 break;
222 case Instruction::Or:
223 case Instruction::Xor:
224 // bseti/binvi
225 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
226 return TTI::TCC_Free;
227 Takes12BitImm = true;
228 break;
229 case Instruction::Mul:
230 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
231 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
232 return TTI::TCC_Free;
233 // One more or less than a power of 2 can use SLLI+ADD/SUB.
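// e.g. x * 9 lowers to (slli t0, x, 3; add x, t0, x) and
//      x * 7 lowers to (slli t0, x, 3; sub x, t0, x).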
234 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
235 return TTI::TCC_Free;
236 // FIXME: There is no MULI instruction.
237 Takes12BitImm = true;
238 break;
239 case Instruction::Sub:
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 Takes12BitImm = true;
244 ImmArgIdx = 1;
245 break;
246 default:
247 break;
250 if (Takes12BitImm) {
251 // Check immediate is the correct argument...
252 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
253 // ... and fits into the 12-bit immediate.
254 if (Imm.getSignificantBits() <= 64 &&
255 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
256 return TTI::TCC_Free;
260 // Otherwise, use the full materialisation cost.
261 return getIntImmCost(Imm, Ty, CostKind);
264 // By default, prevent hoisting.
265 return TTI::TCC_Free;
268 InstructionCost
269 RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
270 const APInt &Imm, Type *Ty,
271 TTI::TargetCostKind CostKind) {
272 // Prevent hoisting in unknown cases.
273 return TTI::TCC_Free;
276 bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
277 return ST->hasVInstructions();
280 TargetTransformInfo::PopcntSupportKind
281 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
282 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
283 return ST->hasStdExtZbb() || (ST->hasVendorXCVbitmanip() && !ST->is64Bit())
284 ? TTI::PSK_FastHardware
285 : TTI::PSK_Software;
288 bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
289 // Currently, the ExpandReductions pass can't expand scalable-vector
290 // reductions, but we still request expansion as RVV doesn't support certain
291 // reductions and the SelectionDAG can't legalize them either.
292 switch (II->getIntrinsicID()) {
293 default:
294 return false;
295 // These reductions have no equivalent in RVV
296 case Intrinsic::vector_reduce_mul:
297 case Intrinsic::vector_reduce_fmul:
298 return true;
302 std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
303 if (ST->hasVInstructions())
304 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
305 return BaseT::getMaxVScale();
308 std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
309 if (ST->hasVInstructions())
310 if (unsigned MinVLen = ST->getRealMinVLen();
311 MinVLen >= RISCV::RVVBitsPerBlock)
312 return MinVLen / RISCV::RVVBitsPerBlock;
313 return BaseT::getVScaleForTuning();
316 TypeSize
317 RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
318 unsigned LMUL =
319 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
320 switch (K) {
321 case TargetTransformInfo::RGK_Scalar:
322 return TypeSize::getFixed(ST->getXLen());
323 case TargetTransformInfo::RGK_FixedWidthVector:
324 return TypeSize::getFixed(
325 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
326 case TargetTransformInfo::RGK_ScalableVector:
327 return TypeSize::getScalable(
328 (ST->hasVInstructions() &&
329 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
330 ? LMUL * RISCV::RVVBitsPerBlock
331 : 0);
334 llvm_unreachable("Unsupported register kind");
337 InstructionCost
338 RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
339 // Add a cost of address generation + the cost of the load. The address
340 // is expected to be a PC relative offset to a constant pool entry
341 // using auipc/addi.
342 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
343 /*AddressSpace=*/0, CostKind);
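// Returns true if Mask is a repeated concatenation of its leading subvector,
// e.g. <0, 1, 0, 1, 0, 1, 0, 1> repeats the first two lanes, setting
// SubVectorSize to 2. An identity mask is not treated as a concatenation.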
346 static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
347 unsigned Size = Mask.size();
348 if (!isPowerOf2_32(Size))
349 return false;
350 for (unsigned I = 0; I != Size; ++I) {
351 if (static_cast<unsigned>(Mask[I]) == I)
352 continue;
353 if (Mask[I] != 0)
354 return false;
355 if (Size % I != 0)
356 return false;
357 for (unsigned J = I + 1; J != Size; ++J)
358 // Check the pattern is repeated.
359 if (static_cast<unsigned>(Mask[J]) != J % I)
360 return false;
361 SubVectorSize = I;
362 return true;
364 // Reaching here means Mask is an identity mask (e.g. <0, 1, 2, 3>), which is not a concatenation.
365 return false;
368 static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
369 LLVMContext &C) {
370 assert((DataVT.getScalarSizeInBits() != 8 ||
371 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
372 MVT IndexVT = DataVT.changeTypeToInteger();
373 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
374 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
375 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
378 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
379 VectorType *Tp, ArrayRef<int> Mask,
380 TTI::TargetCostKind CostKind,
381 int Index, VectorType *SubTp,
382 ArrayRef<const Value *> Args,
383 const Instruction *CxtI) {
384 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
386 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
388 // First, handle cases where having a fixed length vector enables us to
389 // give a more accurate cost than falling back to generic scalable codegen.
390 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
391 if (isa<FixedVectorType>(Tp)) {
392 switch (Kind) {
393 default:
394 break;
395 case TTI::SK_PermuteSingleSrc: {
396 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
397 MVT EltTp = LT.second.getVectorElementType();
398 // If the element size is < ELEN, then interleaving and deinterleaving
399 // shuffles of 2 vectors can be lowered into the following
400 // sequences:
401 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
402 // Example sequence:
403 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
404 // vwaddu.vv v10, v8, v9
405 // li a0, -1 (ignored)
406 // vwmaccu.vx v10, a0, v9
407 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
408 return 2 * LT.first * TLI->getLMULCost(LT.second);
410 if (Mask[0] == 0 || Mask[0] == 1) {
411 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
412 // Example sequence:
413 // vnsrl.wi v10, v8, 0
414 if (equal(DeinterleaveMask, Mask))
415 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
416 LT.second, CostKind);
419 int SubVectorSize;
420 if (LT.second.getScalarSizeInBits() != 1 &&
421 isRepeatedConcatMask(Mask, SubVectorSize)) {
422 InstructionCost Cost = 0;
423 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
424 // The cost of extraction from a subvector is 0 if the index is 0.
425 for (unsigned I = 0; I != NumSlides; ++I) {
426 unsigned InsertIndex = SubVectorSize * (1 << I);
427 FixedVectorType *SubTp =
428 FixedVectorType::get(Tp->getElementType(), InsertIndex);
429 FixedVectorType *DestTp =
430 FixedVectorType::getDoubleElementsVectorType(SubTp);
431 std::pair<InstructionCost, MVT> DestLT =
432 getTypeLegalizationCost(DestTp);
433 // Add the cost of whole vector register move because the
434 // destination vector register group for vslideup cannot overlap the
435 // source.
436 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
437 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, {},
438 CostKind, InsertIndex, SubTp);
440 return Cost;
443 // vrgather + cost of generating the mask constant.
444 // We model this for an unknown mask with a single vrgather.
445 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
446 (LT.second.getScalarSizeInBits() != 8 ||
447 LT.second.getVectorNumElements() <= 256)) {
448 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());
449 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
450 return IndexCost +
451 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
453 [[fallthrough]];
455 case TTI::SK_Transpose:
456 case TTI::SK_PermuteTwoSrc: {
457 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
458 // register for the second vrgather. We model this for an unknown
459 // (shuffle) mask.
460 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
461 (LT.second.getScalarSizeInBits() != 8 ||
462 LT.second.getVectorNumElements() <= 256)) {
463 auto &C = Tp->getContext();
464 auto EC = Tp->getElementCount();
465 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
466 VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
467 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
468 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
469 return 2 * IndexCost +
470 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
471 LT.second, CostKind) +
472 MaskCost;
474 [[fallthrough]];
476 case TTI::SK_Select: {
477 // We are going to permute multiple sources and the result will be in
478 // multiple destinations. We provide an accurate cost only for splits where
479 // the element type remains the same.
480 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
481 LT.second.isFixedLengthVector() &&
482 LT.second.getVectorElementType().getSizeInBits() ==
483 Tp->getElementType()->getPrimitiveSizeInBits() &&
484 LT.second.getVectorNumElements() <
485 cast<FixedVectorType>(Tp)->getNumElements() &&
486 divideCeil(Mask.size(),
487 cast<FixedVectorType>(Tp)->getNumElements()) ==
488 static_cast<unsigned>(*LT.first.getValue())) {
489 unsigned NumRegs = *LT.first.getValue();
490 unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
491 unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
492 auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
494 InstructionCost Cost = 0;
495 for (unsigned I = 0, NumSrcRegs = divideCeil(Mask.size(), SubVF);
496 I < NumSrcRegs; ++I) {
497 bool IsSingleVector = true;
498 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
499 transform(
500 Mask.slice(I * SubVF,
501 I == NumSrcRegs - 1 ? Mask.size() % SubVF : SubVF),
502 SubMask.begin(), [&](int I) -> int {
503 if (I == PoisonMaskElem)
504 return PoisonMaskElem;
505 bool SingleSubVector = I / VF == 0;
506 IsSingleVector &= SingleSubVector;
507 return (SingleSubVector ? 0 : 1) * SubVF + (I % VF) % SubVF;
509 if (all_of(enumerate(SubMask), [](auto &&P) {
510 return P.value() == PoisonMaskElem ||
511 static_cast<unsigned>(P.value()) == P.index();
513 continue;
514 Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
515 : TTI::SK_PermuteTwoSrc,
516 SubVecTy, SubMask, CostKind, 0, nullptr);
518 return Cost;
520 break;
525 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
526 switch (Kind) {
527 default:
528 // Fallthrough to generic handling.
529 // TODO: Most of these cases will return getInvalid in generic code, and
530 // must be implemented here.
531 break;
532 case TTI::SK_ExtractSubvector:
533 // Extract at zero is always a subregister extract
534 if (Index == 0)
535 return TTI::TCC_Free;
537 // If we're extracting a subvector of at most m1 size at a sub-register
538 // boundary - which unfortunately requires exact VLEN to identify - this is
539 // a subregister extract at worst and thus won't require a vslidedown.
540 // TODO: Extend for aligned m2, m4 subvector extracts
541 // TODO: Extend for misaligned (but contained) extracts
542 // TODO: Extend for scalable subvector types
543 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
544 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
545 const unsigned MinVLen = ST->getRealMinVLen();
546 const unsigned MaxVLen = ST->getRealMaxVLen();
547 if (MinVLen == MaxVLen &&
548 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
549 SubLT.second.getSizeInBits() <= MinVLen)
550 return TTI::TCC_Free;
553 // Example sequence:
554 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
555 // vslidedown.vi v8, v9, 2
556 return LT.first *
557 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
558 case TTI::SK_InsertSubvector:
559 // Example sequence:
560 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
561 // vslideup.vi v8, v9, 2
562 return LT.first *
563 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
564 case TTI::SK_Select: {
565 // Example sequence:
566 // li a0, 90
567 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
568 // vmv.s.x v0, a0
569 // vmerge.vvm v8, v9, v8, v0
570 // We use 2 for the cost of the mask materialization as this is the true
571 // cost for small masks and most shuffles are small. At worst, this cost
572 // should be a very small constant for the constant pool load. As such,
573 // we may bias towards large selects slightly more than truly warranted.
574 return LT.first *
575 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
576 LT.second, CostKind));
578 case TTI::SK_Broadcast: {
579 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
580 Instruction::InsertElement);
581 if (LT.second.getScalarSizeInBits() == 1) {
582 if (HasScalar) {
583 // Example sequence:
584 // andi a0, a0, 1
585 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
586 // vmv.v.x v8, a0
587 // vmsne.vi v0, v8, 0
588 return LT.first *
589 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
590 LT.second, CostKind));
592 // Example sequence:
593 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
594 // vmv.v.i v8, 0
595 // vmerge.vim v8, v8, 1, v0
596 // vmv.x.s a0, v8
597 // andi a0, a0, 1
598 // vmv.v.x v8, a0
599 // vmsne.vi v0, v8, 0
601 return LT.first *
602 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
603 RISCV::VMV_X_S, RISCV::VMV_V_X,
604 RISCV::VMSNE_VI},
605 LT.second, CostKind));
608 if (HasScalar) {
609 // Example sequence:
610 // vmv.v.x v8, a0
611 return LT.first *
612 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
615 // Example sequence:
616 // vrgather.vi v9, v8, 0
617 return LT.first *
618 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
620 case TTI::SK_Splice: {
621 // vslidedown+vslideup.
622 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
623 // of similar code, but I think we expand through memory.
624 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
625 if (Index >= 0 && Index < 32)
626 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
627 else if (Index < 0 && Index > -32)
628 Opcodes[1] = RISCV::VSLIDEUP_VI;
629 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
631 case TTI::SK_Reverse: {
632 // TODO: Cases to improve here:
633 // * Illegal vector types
634 // * i64 on RV32
635 // * i1 vector
636 // At low LMUL, most of the cost is producing the vrgather index register.
637 // At high LMUL, the cost of the vrgather itself will dominate.
638 // Example sequence:
639 // csrr a0, vlenb
640 // srli a0, a0, 3
641 // addi a0, a0, -1
642 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
643 // vid.v v9
644 // vrsub.vx v10, v9, a0
645 // vrgather.vv v9, v8, v10
646 InstructionCost LenCost = 3;
647 if (LT.second.isFixedLengthVector())
648 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
649 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
650 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
651 if (LT.second.isFixedLengthVector() &&
652 isInt<5>(LT.second.getVectorNumElements() - 1))
653 Opcodes[1] = RISCV::VRSUB_VI;
654 InstructionCost GatherCost =
655 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
656 // A mask vector additionally requires an extend and a truncate
657 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
658 return LT.first * (LenCost + GatherCost + ExtendCost);
661 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
664 static unsigned isM1OrSmaller(MVT VT) {
665 RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
666 return (LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 ||
667 LMUL == RISCVII::VLMUL::LMUL_F2 || LMUL == RISCVII::VLMUL::LMUL_1);
670 InstructionCost RISCVTTIImpl::getScalarizationOverhead(
671 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
672 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
673 if (isa<ScalableVectorType>(Ty))
674 return InstructionCost::getInvalid();
676 // A build_vector (which is m1 sized or smaller) can be done in no
677 // worse than one vslide1down.vx per element in the type. We could
678 // in theory do an explode_vector in the inverse manner, but our
679 // lowering today does not have a first class node for this pattern.
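// e.g. a fully demanded <4 x i32> build_vector is costed at no more than
// 4 * cost(vslide1down.vx) when its container type is m1 or smaller.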
680 InstructionCost Cost = BaseT::getScalarizationOverhead(
681 Ty, DemandedElts, Insert, Extract, CostKind);
682 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
683 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
684 if (Ty->getScalarSizeInBits() == 1) {
685 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
686 // Note: Implicit scalar anyextend is assumed to be free since the i1
687 // must be stored in a GPR.
688 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
689 CostKind) +
690 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
691 TTI::CastContextHint::None, CostKind, nullptr);
694 assert(LT.second.isFixedLengthVector());
695 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
696 if (isM1OrSmaller(ContainerVT)) {
697 InstructionCost BV =
698 cast<FixedVectorType>(Ty)->getNumElements() *
699 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
700 if (BV < Cost)
701 Cost = BV;
704 return Cost;
707 InstructionCost
708 RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
709 unsigned AddressSpace,
710 TTI::TargetCostKind CostKind) {
711 if (!isLegalMaskedLoadStore(Src, Alignment) ||
712 CostKind != TTI::TCK_RecipThroughput)
713 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
714 CostKind);
716 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
719 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
720 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
721 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
722 bool UseMaskForCond, bool UseMaskForGaps) {
724 // The interleaved memory access pass will lower interleaved memory ops (i.e.
725 // a load or store combined with a specific shuffle) to vlseg/vsseg
726 // intrinsics.
727 if (!UseMaskForCond && !UseMaskForGaps &&
728 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
729 auto *VTy = cast<VectorType>(VecTy);
730 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
731 // Need to make sure the type hasn't been scalarized
732 if (LT.second.isVector()) {
733 auto *SubVecTy =
734 VectorType::get(VTy->getElementType(),
735 VTy->getElementCount().divideCoefficientBy(Factor));
736 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
737 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
738 AddressSpace, DL)) {
740 // Some processors optimize segment loads/stores as one wide memory op +
741 // Factor * LMUL shuffle ops.
742 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
743 InstructionCost Cost =
744 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
745 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
746 Cost += Factor * TLI->getLMULCost(SubVecVT);
747 return LT.first * Cost;
750 // Otherwise, the cost is proportional to the number of elements (VL *
751 // Factor ops).
752 InstructionCost MemOpCost =
753 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
754 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
755 unsigned NumLoads = getEstimatedVLFor(VTy);
756 return NumLoads * MemOpCost;
761 // TODO: Return the cost of interleaved accesses for scalable vectors when
762 // unable to convert them to segment access instructions.
763 if (isa<ScalableVectorType>(VecTy))
764 return InstructionCost::getInvalid();
766 auto *FVTy = cast<FixedVectorType>(VecTy);
767 InstructionCost MemCost =
768 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
769 unsigned VF = FVTy->getNumElements() / Factor;
771 // An interleaved load will look like this for Factor=3:
772 // %wide.vec = load <12 x i32>, ptr %3, align 4
773 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
774 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
775 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
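// Each <stride mask> above is createStrideMask(Index, Factor, VF); for
// Index=0, Factor=3, VF=4 that is <0, 3, 6, 9>.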
776 if (Opcode == Instruction::Load) {
777 InstructionCost Cost = MemCost;
778 for (unsigned Index : Indices) {
779 FixedVectorType *SubVecTy =
780 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
781 auto Mask = createStrideMask(Index, Factor, VF);
782 InstructionCost ShuffleCost =
783 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
784 CostKind, 0, nullptr, {});
785 Cost += ShuffleCost;
787 return Cost;
790 // TODO: Model for NF > 2
791 // We'll need to enhance getShuffleCost to model shuffles that are just
792 // inserts and extracts into subvectors, since they won't have the full cost
793 // of a vrgather.
794 // An interleaved store for 3 vectors of 4 lanes will look like
795 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
796 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
797 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
798 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
799 // store <12 x i32> %interleaved.vec, ptr %10, align 4
800 if (Factor != 2)
801 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
802 Alignment, AddressSpace, CostKind,
803 UseMaskForCond, UseMaskForGaps);
805 assert(Opcode == Instruction::Store && "Opcode must be a store");
806 // For an interleaving store of 2 vectors, we perform one large interleaving
807 // shuffle that goes into the wide store
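// e.g. for VF=4 and Factor=2 the interleave mask is <0, 4, 1, 5, 2, 6, 3, 7>.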
808 auto Mask = createInterleaveMask(VF, Factor);
809 InstructionCost ShuffleCost =
810 getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
811 CostKind, 0, nullptr, {});
812 return MemCost + ShuffleCost;
815 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
816 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
817 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
818 if (CostKind != TTI::TCK_RecipThroughput)
819 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
820 Alignment, CostKind, I);
822 if ((Opcode == Instruction::Load &&
823 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
824 (Opcode == Instruction::Store &&
825 !isLegalMaskedScatter(DataTy, Align(Alignment))))
826 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
827 Alignment, CostKind, I);
829 // Cost is proportional to the number of memory operations implied. For
830 // scalable vectors, we use an estimate on that number since we don't
831 // know exactly what VL will be.
832 auto &VTy = *cast<VectorType>(DataTy);
833 InstructionCost MemOpCost =
834 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
835 {TTI::OK_AnyValue, TTI::OP_None}, I);
836 unsigned NumLoads = getEstimatedVLFor(&VTy);
837 return NumLoads * MemOpCost;
840 InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
841 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
842 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
843 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
844 !isLegalStridedLoadStore(DataTy, Alignment)) ||
845 (Opcode != Instruction::Load && Opcode != Instruction::Store))
846 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
847 Alignment, CostKind, I);
849 if (CostKind == TTI::TCK_CodeSize)
850 return TTI::TCC_Basic;
852 // Cost is proportional to the number of memory operations implied. For
853 // scalable vectors, we use an estimate on that number since we don't
854 // know exactly what VL will be.
855 auto &VTy = *cast<VectorType>(DataTy);
856 InstructionCost MemOpCost =
857 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
858 {TTI::OK_AnyValue, TTI::OP_None}, I);
859 unsigned NumLoads = getEstimatedVLFor(&VTy);
860 return NumLoads * MemOpCost;
863 InstructionCost
864 RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
865 // FIXME: This is a property of the default vector convention, not
866 // all possible calling conventions. Fixing that will require
867 // some TTI API and SLP rework.
868 InstructionCost Cost = 0;
869 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
870 for (auto *Ty : Tys) {
871 if (!Ty->isVectorTy())
872 continue;
873 Align A = DL.getPrefTypeAlign(Ty);
874 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
875 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
877 return Cost;
880 // Currently, these represent both throughput and codesize costs
881 // for the respective intrinsics. The costs in this table are simply
882 // instruction counts with the following adjustments made:
883 // * One vsetvli is considered free.
884 static const CostTblEntry VectorIntrinsicCostTable[]{
885 {Intrinsic::floor, MVT::f32, 9},
886 {Intrinsic::floor, MVT::f64, 9},
887 {Intrinsic::ceil, MVT::f32, 9},
888 {Intrinsic::ceil, MVT::f64, 9},
889 {Intrinsic::trunc, MVT::f32, 7},
890 {Intrinsic::trunc, MVT::f64, 7},
891 {Intrinsic::round, MVT::f32, 9},
892 {Intrinsic::round, MVT::f64, 9},
893 {Intrinsic::roundeven, MVT::f32, 9},
894 {Intrinsic::roundeven, MVT::f64, 9},
895 {Intrinsic::rint, MVT::f32, 7},
896 {Intrinsic::rint, MVT::f64, 7},
897 {Intrinsic::lrint, MVT::i32, 1},
898 {Intrinsic::lrint, MVT::i64, 1},
899 {Intrinsic::llrint, MVT::i64, 1},
900 {Intrinsic::nearbyint, MVT::f32, 9},
901 {Intrinsic::nearbyint, MVT::f64, 9},
902 {Intrinsic::bswap, MVT::i16, 3},
903 {Intrinsic::bswap, MVT::i32, 12},
904 {Intrinsic::bswap, MVT::i64, 31},
905 {Intrinsic::vp_bswap, MVT::i16, 3},
906 {Intrinsic::vp_bswap, MVT::i32, 12},
907 {Intrinsic::vp_bswap, MVT::i64, 31},
908 {Intrinsic::vp_fshl, MVT::i8, 7},
909 {Intrinsic::vp_fshl, MVT::i16, 7},
910 {Intrinsic::vp_fshl, MVT::i32, 7},
911 {Intrinsic::vp_fshl, MVT::i64, 7},
912 {Intrinsic::vp_fshr, MVT::i8, 7},
913 {Intrinsic::vp_fshr, MVT::i16, 7},
914 {Intrinsic::vp_fshr, MVT::i32, 7},
915 {Intrinsic::vp_fshr, MVT::i64, 7},
916 {Intrinsic::bitreverse, MVT::i8, 17},
917 {Intrinsic::bitreverse, MVT::i16, 24},
918 {Intrinsic::bitreverse, MVT::i32, 33},
919 {Intrinsic::bitreverse, MVT::i64, 52},
920 {Intrinsic::vp_bitreverse, MVT::i8, 17},
921 {Intrinsic::vp_bitreverse, MVT::i16, 24},
922 {Intrinsic::vp_bitreverse, MVT::i32, 33},
923 {Intrinsic::vp_bitreverse, MVT::i64, 52},
924 {Intrinsic::ctpop, MVT::i8, 12},
925 {Intrinsic::ctpop, MVT::i16, 19},
926 {Intrinsic::ctpop, MVT::i32, 20},
927 {Intrinsic::ctpop, MVT::i64, 21},
928 {Intrinsic::ctlz, MVT::i8, 19},
929 {Intrinsic::ctlz, MVT::i16, 28},
930 {Intrinsic::ctlz, MVT::i32, 31},
931 {Intrinsic::ctlz, MVT::i64, 35},
932 {Intrinsic::cttz, MVT::i8, 16},
933 {Intrinsic::cttz, MVT::i16, 23},
934 {Intrinsic::cttz, MVT::i32, 24},
935 {Intrinsic::cttz, MVT::i64, 25},
936 {Intrinsic::vp_ctpop, MVT::i8, 12},
937 {Intrinsic::vp_ctpop, MVT::i16, 19},
938 {Intrinsic::vp_ctpop, MVT::i32, 20},
939 {Intrinsic::vp_ctpop, MVT::i64, 21},
940 {Intrinsic::vp_ctlz, MVT::i8, 19},
941 {Intrinsic::vp_ctlz, MVT::i16, 28},
942 {Intrinsic::vp_ctlz, MVT::i32, 31},
943 {Intrinsic::vp_ctlz, MVT::i64, 35},
944 {Intrinsic::vp_cttz, MVT::i8, 16},
945 {Intrinsic::vp_cttz, MVT::i16, 23},
946 {Intrinsic::vp_cttz, MVT::i32, 24},
947 {Intrinsic::vp_cttz, MVT::i64, 25},
950 static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
951 switch (ID) {
952 #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
953 case Intrinsic::VPID: \
954 return ISD::VPSD;
955 #include "llvm/IR/VPIntrinsics.def"
956 #undef HELPER_MAP_VPID_TO_VPSD
958 return ISD::DELETED_NODE;
961 InstructionCost
962 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
963 TTI::TargetCostKind CostKind) {
964 auto *RetTy = ICA.getReturnType();
965 switch (ICA.getID()) {
966 case Intrinsic::lrint:
967 case Intrinsic::llrint:
968 // We can't currently lower half or bfloat vector lrint/llrint.
969 if (auto *VecTy = dyn_cast<VectorType>(ICA.getArgTypes()[0]);
970 VecTy && VecTy->getElementType()->is16bitFPTy())
971 return InstructionCost::getInvalid();
972 [[fallthrough]];
973 case Intrinsic::ceil:
974 case Intrinsic::floor:
975 case Intrinsic::trunc:
976 case Intrinsic::rint:
977 case Intrinsic::round:
978 case Intrinsic::roundeven: {
979 // These all use the same code.
980 auto LT = getTypeLegalizationCost(RetTy);
981 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
982 return LT.first * 8;
983 break;
985 case Intrinsic::umin:
986 case Intrinsic::umax:
987 case Intrinsic::smin:
988 case Intrinsic::smax: {
989 auto LT = getTypeLegalizationCost(RetTy);
990 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
991 return LT.first;
993 if (ST->hasVInstructions() && LT.second.isVector()) {
994 unsigned Op;
995 switch (ICA.getID()) {
996 case Intrinsic::umin:
997 Op = RISCV::VMINU_VV;
998 break;
999 case Intrinsic::umax:
1000 Op = RISCV::VMAXU_VV;
1001 break;
1002 case Intrinsic::smin:
1003 Op = RISCV::VMIN_VV;
1004 break;
1005 case Intrinsic::smax:
1006 Op = RISCV::VMAX_VV;
1007 break;
1009 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1011 break;
1013 case Intrinsic::sadd_sat:
1014 case Intrinsic::ssub_sat:
1015 case Intrinsic::uadd_sat:
1016 case Intrinsic::usub_sat: {
1017 auto LT = getTypeLegalizationCost(RetTy);
1018 if (ST->hasVInstructions() && LT.second.isVector()) {
1019 unsigned Op;
1020 switch (ICA.getID()) {
1021 case Intrinsic::sadd_sat:
1022 Op = RISCV::VSADD_VV;
1023 break;
1024 case Intrinsic::ssub_sat:
1025 Op = RISCV::VSSUB_VV;
1026 break;
1027 case Intrinsic::uadd_sat:
1028 Op = RISCV::VSADDU_VV;
1029 break;
1030 case Intrinsic::usub_sat:
1031 Op = RISCV::VSSUBU_VV;
1032 break;
1034 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1036 break;
1038 case Intrinsic::fabs:
1039 case Intrinsic::sqrt: {
1040 auto LT = getTypeLegalizationCost(RetTy);
1041 // TODO: add f16/bf16, bf16 with zvfbfmin && f16 with zvfhmin
1042 if (ST->hasVInstructions() && LT.second.isVector()) {
1043 unsigned Op;
1044 switch (ICA.getID()) {
1045 case Intrinsic::fabs:
1046 Op = RISCV::VFSGNJX_VV;
1047 break;
1048 case Intrinsic::sqrt:
1049 Op = RISCV::VFSQRT_V;
1050 break;
1052 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1054 break;
1056 case Intrinsic::cttz:
1057 case Intrinsic::ctlz:
1058 case Intrinsic::ctpop: {
1059 auto LT = getTypeLegalizationCost(RetTy);
1060 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) {
1061 unsigned Op;
1062 switch (ICA.getID()) {
1063 case Intrinsic::cttz:
1064 Op = RISCV::VCTZ_V;
1065 break;
1066 case Intrinsic::ctlz:
1067 Op = RISCV::VCLZ_V;
1068 break;
1069 case Intrinsic::ctpop:
1070 Op = RISCV::VCPOP_V;
1071 break;
1073 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1075 break;
1077 case Intrinsic::abs: {
1078 auto LT = getTypeLegalizationCost(RetTy);
1079 if (ST->hasVInstructions() && LT.second.isVector()) {
1080 // vrsub.vi v10, v8, 0
1081 // vmax.vv v8, v8, v10
1082 return LT.first *
1083 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1084 LT.second, CostKind);
1086 break;
1088 case Intrinsic::get_active_lane_mask: {
1089 if (ST->hasVInstructions()) {
1090 Type *ExpRetTy = VectorType::get(
1091 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1092 auto LT = getTypeLegalizationCost(ExpRetTy);
1094 // vid.v v8 // considered hoisted
1095 // vsaddu.vx v8, v8, a0
1096 // vmsltu.vx v0, v8, a1
1097 return LT.first *
1098 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1099 LT.second, CostKind);
1101 break;
1103 // TODO: add more intrinsics
1104 case Intrinsic::stepvector: {
1105 auto LT = getTypeLegalizationCost(RetTy);
1106 // Legalisation of illegal types involves an `index' instruction plus
1107 // (LT.first - 1) vector adds.
1108 if (ST->hasVInstructions())
1109 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1110 (LT.first - 1) *
1111 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1112 return 1 + (LT.first - 1);
1114 case Intrinsic::experimental_cttz_elts: {
1115 Type *ArgTy = ICA.getArgTypes()[0];
1116 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1117 if (getTLI()->shouldExpandCttzElements(ArgType))
1118 break;
1119 InstructionCost Cost = getRISCVInstructionCost(
1120 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1122 // If zero_is_poison is false, then we will generate additional
1123 // cmp + select instructions to convert -1 to EVL.
1124 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1125 if (ICA.getArgs().size() > 1 &&
1126 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1127 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1128 CmpInst::ICMP_SLT, CostKind) +
1129 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1130 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1132 return Cost;
1134 case Intrinsic::vp_rint: {
1135 // RISC-V target uses at least 5 instructions to lower rounding intrinsics.
1136 unsigned Cost = 5;
1137 auto LT = getTypeLegalizationCost(RetTy);
1138 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1139 return Cost * LT.first;
1140 break;
1142 case Intrinsic::vp_nearbyint: {
1143 // One more read and one write of fflags than vp_rint.
1144 unsigned Cost = 7;
1145 auto LT = getTypeLegalizationCost(RetTy);
1146 if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
1147 return Cost * LT.first;
1148 break;
1150 case Intrinsic::vp_ceil:
1151 case Intrinsic::vp_floor:
1152 case Intrinsic::vp_round:
1153 case Intrinsic::vp_roundeven:
1154 case Intrinsic::vp_roundtozero: {
1155 // Rounding with static rounding mode needs two more instructions to
1156 // swap/write FRM than vp_rint.
1157 unsigned Cost = 7;
1158 auto LT = getTypeLegalizationCost(RetTy);
1159 unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
1160 if (TLI->isOperationCustom(VPISD, LT.second))
1161 return Cost * LT.first;
1162 break;
1164 case Intrinsic::vp_fneg: {
1165 std::optional<unsigned> FOp =
1166 VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
1167 assert(FOp.has_value());
1168 return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
1169 break;
1171 case Intrinsic::vp_select: {
1172 Intrinsic::ID IID = ICA.getID();
1173 std::optional<unsigned> FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID);
1174 assert(FOp.has_value());
1175 return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1176 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1178 case Intrinsic::vp_merge:
1179 return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(),
1180 ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE,
1181 CostKind);
1182 case Intrinsic::experimental_vp_splat: {
1183 auto LT = getTypeLegalizationCost(RetTy);
1184 // TODO: Lower i1 experimental_vp_splat
1185 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1186 return InstructionCost::getInvalid();
1187 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1188 ? RISCV::VFMV_V_F
1189 : RISCV::VMV_V_X,
1190 LT.second, CostKind);
1194 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1195 if (auto LT = getTypeLegalizationCost(RetTy);
1196 LT.second.isVector()) {
1197 MVT EltTy = LT.second.getVectorElementType();
1198 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1199 ICA.getID(), EltTy))
1200 return LT.first * Entry->Cost;
1204 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1207 InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1208 Type *Src,
1209 TTI::CastContextHint CCH,
1210 TTI::TargetCostKind CostKind,
1211 const Instruction *I) {
1212 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1213 if (!IsVectorType)
1214 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1216 // FIXME: Need to compute legalizing cost for illegal types. The current
1217 // code handles only legal types and those which can be trivially
1218 // promoted to legal.
1219 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1220 Dst->getScalarSizeInBits() > ST->getELen())
1221 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1223 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1224 assert(ISD && "Invalid opcode");
1225 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1226 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1228 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1229 // The shared implementation doesn't model vector widening during legalization
1230 // and instead assumes scalarization. In order to scalarize an <N x i1>
1231 // vector, we need to extend/trunc to/from i8. If we don't special case
1232 // this, we can get an infinite recursion cycle.
1233 switch (ISD) {
1234 default:
1235 break;
1236 case ISD::SIGN_EXTEND:
1237 case ISD::ZERO_EXTEND:
1238 if (Src->getScalarSizeInBits() == 1) {
1239 // We do not use vsext/vzext to extend from a mask vector.
1240 // Instead we use the following instructions to extend from a mask vector:
1241 // vmv.v.i v8, 0
1242 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1243 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1244 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1245 DstLT.second, CostKind) +
1246 DstLT.first - 1;
1248 break;
1249 case ISD::TRUNCATE:
1250 if (Dst->getScalarSizeInBits() == 1) {
1251 // We do not use a sequence of vncvt instructions to truncate to a mask
1252 // vector, so we cannot use PowDiff to calculate the cost.
1253 // Instead we use the following instructions to truncate to a mask vector:
1254 // vand.vi v8, v8, 1
1255 // vmsne.vi v0, v8, 0
1256 return SrcLT.first *
1257 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1258 SrcLT.second, CostKind) +
1259 SrcLT.first - 1;
1261 break;
1264 // Our actual lowering for the case where a wider legal type is available
1265 // uses promotion to the wider type. This is reflected in the result of
1266 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1267 // scalarized if the legalized Src and Dst are not equal sized.
1268 const DataLayout &DL = this->getDataLayout();
1269 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1270 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1271 SrcLT.second.getSizeInBits()) ||
1272 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1273 DstLT.second.getSizeInBits()))
1274 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1276 // The split cost is handled by the base getCastInstrCost
1277 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1279 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1280 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
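// e.g. an i8 -> i32 extend has PowDiff = 2 and maps to a single
// vsext.vf4/vzext.vf4 in the extend case below.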
1281 switch (ISD) {
1282 case ISD::SIGN_EXTEND:
1283 case ISD::ZERO_EXTEND: {
1284 if ((PowDiff < 1) || (PowDiff > 3))
1285 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1286 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1287 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1288 unsigned Op =
1289 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1290 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1292 case ISD::TRUNCATE:
1293 case ISD::FP_EXTEND:
1294 case ISD::FP_ROUND: {
1295 // Counts of narrow/widen instructions.
1296 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1297 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1299 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1300 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1301 : RISCV::VFNCVT_F_F_W;
1302 InstructionCost Cost = 0;
1303 for (; SrcEltSize != DstEltSize;) {
1304 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1305 ? MVT::getIntegerVT(DstEltSize)
1306 : MVT::getFloatingPointVT(DstEltSize);
1307 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1308 DstEltSize =
1309 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1310 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1312 return Cost;
1314 case ISD::FP_TO_SINT:
1315 case ISD::FP_TO_UINT: {
1316 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1317 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1318 unsigned FWCVT =
1319 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1320 unsigned FNCVT =
1321 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1322 unsigned SrcEltSize = Src->getScalarSizeInBits();
1323 unsigned DstEltSize = Dst->getScalarSizeInBits();
1324 InstructionCost Cost = 0;
1325 if ((SrcEltSize == 16) &&
1326 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1327 // If the target only supports zvfhmin, or this is an fp16-to-i64
1328 // conversion, pre-widen to f32 and then convert f32 to integer.
1329 VectorType *VecF32Ty =
1330 VectorType::get(Type::getFloatTy(Dst->getContext()),
1331 cast<VectorType>(Dst)->getElementCount());
1332 std::pair<InstructionCost, MVT> VecF32LT =
1333 getTypeLegalizationCost(VecF32Ty);
1334 Cost +=
1335 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1336 VecF32LT.second, CostKind);
1337 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1338 return Cost;
1340 if (DstEltSize == SrcEltSize)
1341 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1342 else if (DstEltSize > SrcEltSize)
1343 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1344 else { // (SrcEltSize > DstEltSize)
1345 // First do a narrowing conversion to an integer half the size, then
1346 // truncate if needed.
1347 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1348 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1349 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1350 if ((SrcEltSize / 2) > DstEltSize) {
1351 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1352 Cost +=
1353 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1356 return Cost;
1358 case ISD::SINT_TO_FP:
1359 case ISD::UINT_TO_FP: {
1360 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1361 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1362 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1363 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1364 unsigned SrcEltSize = Src->getScalarSizeInBits();
1365 unsigned DstEltSize = Dst->getScalarSizeInBits();
1367 InstructionCost Cost = 0;
1368 if ((DstEltSize == 16) &&
1369 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1370 // If the target only supports zvfhmin, or this is an i64-to-fp16
1371 // conversion, convert to f32 first and then convert f32 to f16.
1372 VectorType *VecF32Ty =
1373 VectorType::get(Type::getFloatTy(Dst->getContext()),
1374 cast<VectorType>(Dst)->getElementCount());
1375 std::pair<InstructionCost, MVT> VecF32LT =
1376 getTypeLegalizationCost(VecF32Ty);
1377 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1378 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1379 DstLT.second, CostKind);
1380 return Cost;
1383 if (DstEltSize == SrcEltSize)
1384 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1385 else if (DstEltSize > SrcEltSize) {
1386 if ((DstEltSize / 2) > SrcEltSize) {
1387 VectorType *VecTy =
1388 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1389 cast<VectorType>(Dst)->getElementCount());
1390 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1391 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1393 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1394 } else
1395 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1396 return Cost;
1399 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
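// Estimate the number of active lanes (VL) for a vector type: the exact
// element count for fixed-length vectors, or VLMAX derived from the tuning
// vscale for scalable vectors.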
1402 unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1403 if (isa<ScalableVectorType>(Ty)) {
1404 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1405 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1406 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1407 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1409 return cast<FixedVectorType>(Ty)->getNumElements();
1412 InstructionCost
1413 RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1414 FastMathFlags FMF,
1415 TTI::TargetCostKind CostKind) {
1416 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1417 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1419 // Skip if scalar size of Ty is bigger than ELEN.
1420 if (Ty->getScalarSizeInBits() > ST->getELen())
1421 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1423 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1424 if (Ty->getElementType()->isIntegerTy(1)) {
1425 // SelectionDAGBuilder does the following transforms:
1426 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1427 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1428 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1429 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1430 else
1431 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1434 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1435 SmallVector<unsigned, 3> Opcodes;
1436 InstructionCost ExtraCost = 0;
1437 switch (IID) {
1438 case Intrinsic::maximum:
1439 if (FMF.noNaNs()) {
1440 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1441 } else {
1442 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1443 RISCV::VFMV_F_S};
1444 // Cost of canonical NaN + branch
1445 // lui a0, 523264
1446 // fmv.w.x fa0, a0
1447 Type *DstTy = Ty->getScalarType();
1448 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1449 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1450 ExtraCost = 1 +
1451 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1452 TTI::CastContextHint::None, CostKind) +
1453 getCFInstrCost(Instruction::Br, CostKind);
1455 break;
1457 case Intrinsic::minimum:
1458 if (FMF.noNaNs()) {
1459 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1460 } else {
1461 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1462 RISCV::VFMV_F_S};
1463 // Cost of canonical NaN + branch
1464 // lui a0, 523264
1465 // fmv.w.x fa0, a0
1466 Type *DstTy = Ty->getScalarType();
1467 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1468 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1469 ExtraCost = 1 +
1470 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1471 TTI::CastContextHint::None, CostKind) +
1472 getCFInstrCost(Instruction::Br, CostKind);
1474 break;
1476 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1479 // An IR reduction is composed of one RVV reduction instruction and a vmv
1480 unsigned SplitOp;
1481 SmallVector<unsigned, 3> Opcodes;
1482 switch (IID) {
1483 default:
1484 llvm_unreachable("Unsupported intrinsic");
1485 case Intrinsic::smax:
1486 SplitOp = RISCV::VMAX_VV;
1487 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1488 break;
1489 case Intrinsic::smin:
1490 SplitOp = RISCV::VMIN_VV;
1491 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1492 break;
1493 case Intrinsic::umax:
1494 SplitOp = RISCV::VMAXU_VV;
1495 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1496 break;
1497 case Intrinsic::umin:
1498 SplitOp = RISCV::VMINU_VV;
1499 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1500 break;
1501 case Intrinsic::maxnum:
1502 SplitOp = RISCV::VFMAX_VV;
1503 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1504 break;
1505 case Intrinsic::minnum:
1506 SplitOp = RISCV::VFMIN_VV;
1507 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1508 break;
1510 // Add a cost for data larger than LMUL8
1511 InstructionCost SplitCost =
1512 (LT.first > 1) ? (LT.first - 1) *
1513 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1514 : 0;
1515 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1518 InstructionCost
1519 RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1520 std::optional<FastMathFlags> FMF,
1521 TTI::TargetCostKind CostKind) {
1522 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1523 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1525 // Skip if scalar size of Ty is bigger than ELEN.
1526 if (Ty->getScalarSizeInBits() > ST->getELen())
1527 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1529 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1530 assert(ISD && "Invalid opcode");
1532 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1533 ISD != ISD::FADD)
1534 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1536 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1537 Type *ElementTy = Ty->getElementType();
1538 if (ElementTy->isIntegerTy(1)) {
1539 if (ISD == ISD::AND) {
1540 // Example sequences:
1541 // vsetvli a0, zero, e8, mf8, ta, ma
1542 // vmand.mm v8, v9, v8 ; needed every time type is split
1543 // vmnot.m v8, v0
1544 // vcpop.m a0, v8
1545 // seqz a0, a0
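// i.e. reduce_and over i1 is modeled as vcpop(~m) == 0.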
1546 return LT.first * getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second,
1547 CostKind) +
1548 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1549 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1550 CmpInst::ICMP_EQ, CostKind);
1551 } else if (ISD == ISD::XOR) {
1552 // Example sequences:
1553 // vsetvli a0, zero, e8, mf8, ta, ma
1554 // vmxor.mm v8, v0, v8 ; needed every time type is split
1555 // vcpop.m a0, v8
1556 // andi a0, a0, 1
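// i.e. reduce_xor over i1 is modeled as the parity of vcpop(m).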
1557 return (LT.first - 1) *
1558 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1559 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1560 } else {
1561 // Example sequences:
1562 // vsetvli a0, zero, e8, mf8, ta, ma
1563 // vmxor.mm v8, v9, v8 ; needed every time type is split
1564 // vcpop.m a0, v0
1565 // snez a0, a0
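// i.e. the remaining i1 cases (e.g. reduce_or) are modeled as vcpop(m) != 0.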
1566 return (LT.first - 1) *
1567 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1568 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1569 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1570 CmpInst::ICMP_NE, CostKind);
1571 }
1572 }
1574 // An IR reduction of or/and is composed of one vmv and one RVV reduction
1575 // instruction; the others are composed of two vmv and one RVV reduction
1576 // instruction.
1577 unsigned SplitOp;
1578 SmallVector<unsigned, 3> Opcodes;
1579 switch (ISD) {
1580 case ISD::ADD:
1581 SplitOp = RISCV::VADD_VV;
1582 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1583 break;
1584 case ISD::OR:
1585 SplitOp = RISCV::VOR_VV;
1586 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1587 break;
1588 case ISD::XOR:
1589 SplitOp = RISCV::VXOR_VV;
1590 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1591 break;
1592 case ISD::AND:
1593 SplitOp = RISCV::VAND_VV;
1594 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1595 break;
1596 case ISD::FADD:
1597 // We can't promote f16/bf16 fadd reductions.
1598 if ((LT.second.getVectorElementType() == MVT::f16 &&
1599 !ST->hasVInstructionsF16()) ||
1600 LT.second.getVectorElementType() == MVT::bf16)
1601 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1602 if (TTI::requiresOrderedReduction(FMF)) {
1603 Opcodes.push_back(RISCV::VFMV_S_F);
1604 for (unsigned i = 0; i < LT.first.getValue(); i++)
1605 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1606 Opcodes.push_back(RISCV::VFMV_F_S);
1607 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1608 }
1609 SplitOp = RISCV::VFADD_VV;
1610 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1611 break;
1612 }
1613 // Add a cost for data larger than LMUL8
1614 InstructionCost SplitCost =
1615 (LT.first > 1) ? (LT.first - 1) *
1616 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1617 : 0;
1618 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1619 }
1621 InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1622 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1623 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1624 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1625 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1626 FMF, CostKind);
1628 // Skip if scalar size of ResTy is bigger than ELEN.
1629 if (ResTy->getScalarSizeInBits() > ST->getELen())
1630 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1631 FMF, CostKind);
1633 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1634 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1635 FMF, CostKind);
1637 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1639 if (IsUnsigned && Opcode == Instruction::Add &&
1640 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
1641 // Represent vector_reduce_add(ZExt(<n x i1>)) as
1642 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
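// E.g. vector_reduce_add(zext <8 x i1> %m to <8 x i32>) is costed as a
// single vcpop.m of the mask.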
1643 return LT.first *
1644 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
1645 }
1647 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1648 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
1649 FMF, CostKind);
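// Otherwise the extend is expected to fold into a widening reduction
// (vwredsum[u].vs / vfwredusum.vs), so the cost is modeled as the plain
// reduction plus one extra op per additional register group.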
1651 return (LT.first - 1) +
1652 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1653 }
1655 InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1656 TTI::OperandValueInfo OpInfo,
1657 TTI::TargetCostKind CostKind) {
1658 assert(OpInfo.isConstant() && "non constant operand?");
1659 if (!isa<VectorType>(Ty))
1660 // FIXME: We need to account for immediate materialization here, but doing
1661 // a decent job requires more knowledge about the immediate than we
1662 // currently have here.
1663 return 0;
1665 if (OpInfo.isUniform())
1666 // vmv.v.i, vmv.v.x, or vfmv.v.f
1667 // We ignore the cost of the scalar constant materialization to be consistent
1668 // with how we treat scalar constants themselves just above.
1669 return 1;
1671 return getConstantPoolLoadCost(Ty, CostKind);
1672 }
1675 InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1676 MaybeAlign Alignment,
1677 unsigned AddressSpace,
1678 TTI::TargetCostKind CostKind,
1679 TTI::OperandValueInfo OpInfo,
1680 const Instruction *I) {
1681 EVT VT = TLI->getValueType(DL, Src, true);
1682 // Type legalization can't handle structs
1683 if (VT == MVT::Other)
1684 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1685 CostKind, OpInfo, I);
1687 InstructionCost Cost = 0;
1688 if (Opcode == Instruction::Store && OpInfo.isConstant())
1689 Cost += getStoreImmCost(Src, OpInfo, CostKind);
1691 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1693 InstructionCost BaseCost = [&]() {
1694 InstructionCost Cost = LT.first;
1695 if (CostKind != TTI::TCK_RecipThroughput)
1696 return Cost;
1698 // Our actual lowering for the case where a wider legal type is available
1699 // uses a VL-predicated load on the wider type. This is reflected in
1700 // the result of getTypeLegalizationCost, but BasicTTI assumes the
1701 // widened cases are scalarized.
1702 const DataLayout &DL = this->getDataLayout();
1703 if (Src->isVectorTy() && LT.second.isVector() &&
1704 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1705 LT.second.getSizeInBits()))
1706 return Cost;
1708 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1709 CostKind, OpInfo, I);
1710 }();
1712 // Assume memory op costs scale with the number of vector registers
1713 // possibly accessed by the instruction. Note that BasicTTI already
1714 // handles the LT.first term for us.
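// E.g. an LMUL4 access is costed at roughly 4x an LMUL1 access for
// throughput and latency.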
1715 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1716 BaseCost *= TLI->getLMULCost(LT.second);
1717 return Cost + BaseCost;
1718 }
1721 InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
1722 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1723 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
1724 TTI::OperandValueInfo Op2Info, const Instruction *I) {
1725 if (CostKind != TTI::TCK_RecipThroughput)
1726 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1727 Op1Info, Op2Info, I);
1729 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1730 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1731 Op1Info, Op2Info, I);
1733 // Skip if scalar size of ValTy is bigger than ELEN.
1734 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1735 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1736 Op1Info, Op2Info, I);
1738 auto GetConstantMatCost =
1739 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
1740 if (OpInfo.isUniform())
1741 // We return 0 because we currently ignore the cost of materializing scalar
1742 // constants in GPRs.
1743 return 0;
1745 return getConstantPoolLoadCost(ValTy, CostKind);
1746 };
1748 InstructionCost ConstantMatCost;
1749 if (Op1Info.isConstant())
1750 ConstantMatCost += GetConstantMatCost(Op1Info);
1751 if (Op2Info.isConstant())
1752 ConstantMatCost += GetConstantMatCost(Op2Info);
1754 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1755 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1756 if (CondTy->isVectorTy()) {
1757 if (ValTy->getScalarSizeInBits() == 1) {
1758 // vmandn.mm v8, v8, v9
1759 // vmand.mm v9, v0, v9
1760 // vmor.mm v0, v9, v8
1761 return ConstantMatCost +
1762 LT.first *
1763 getRISCVInstructionCost(
1764 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1765 LT.second, CostKind);
1766 }
1767 // vselect and max/min are supported natively.
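// vmerge.vvm v8, v9, v8, v0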
1768 return ConstantMatCost +
1769 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
1770 CostKind);
1771 }
1773 if (ValTy->getScalarSizeInBits() == 1) {
1774 // vmv.v.x v9, a0
1775 // vmsne.vi v9, v9, 0
1776 // vmandn.mm v8, v8, v9
1777 // vmand.mm v9, v0, v9
1778 // vmor.mm v0, v9, v8
1779 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
1780 return ConstantMatCost +
1781 LT.first *
1782 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
1783 InterimVT, CostKind) +
1784 LT.first * getRISCVInstructionCost(
1785 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1786 LT.second, CostKind);
1787 }
1789 // vmv.v.x v10, a0
1790 // vmsne.vi v0, v10, 0
1791 // vmerge.vvm v8, v9, v8, v0
1792 return ConstantMatCost +
1793 LT.first * getRISCVInstructionCost(
1794 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1795 LT.second, CostKind);
1796 }
1798 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1799 CmpInst::isIntPredicate(VecPred)) {
1800 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
1801 // provided they incur the same cost across all implementations
1802 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
1803 LT.second,
1804 CostKind);
1805 }
1807 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1808 CmpInst::isFPPredicate(VecPred)) {
1810 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
1811 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1812 return ConstantMatCost +
1813 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
1815 // If we do not support the input floating point vector type, use the base
1816 // one which will calculate as:
1817 // ScalarizeCost + Num * Cost for fixed vector,
1818 // InvalidCost for scalable vector.
1819 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1820 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1821 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1822 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1823 Op1Info, Op2Info, I);
1825 // Assuming vector fp compare and mask instructions are all the same cost
1826 // until a need arises to differentiate them.
1827 switch (VecPred) {
1828 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1829 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1830 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1831 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1832 return ConstantMatCost +
1833 LT.first * getRISCVInstructionCost(
1834 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1835 LT.second, CostKind);
1837 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1838 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1839 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1840 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1841 return ConstantMatCost +
1842 LT.first *
1843 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1844 LT.second, CostKind);
1846 case CmpInst::FCMP_OEQ: // vmfeq.vv
1847 case CmpInst::FCMP_OGT: // vmflt.vv
1848 case CmpInst::FCMP_OGE: // vmfle.vv
1849 case CmpInst::FCMP_OLT: // vmflt.vv
1850 case CmpInst::FCMP_OLE: // vmfle.vv
1851 case CmpInst::FCMP_UNE: // vmfne.vv
1852 return ConstantMatCost +
1853 LT.first *
1854 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
1855 default:
1856 break;
1857 }
1858 }
1860 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
1861 // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
1862 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
1863 // be (0 + select instr cost).
1864 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
1865 ValTy->isIntegerTy() && !I->user_empty()) {
1866 if (all_of(I->users(), [&](const User *U) {
1867 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
1868 U->getType()->isIntegerTy() &&
1869 !isa<ConstantData>(U->getOperand(1)) &&
1870 !isa<ConstantData>(U->getOperand(2));
1871 }))
1872 return 0;
1873 }
1875 // TODO: Add cost for scalar type.
1877 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1878 Op1Info, Op2Info, I);
1879 }
1881 InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1882 TTI::TargetCostKind CostKind,
1883 const Instruction *I) {
1884 if (CostKind != TTI::TCK_RecipThroughput)
1885 return Opcode == Instruction::PHI ? 0 : 1;
1886 // Branches are assumed to be predicted.
1887 return 0;
1888 }
1890 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1891 TTI::TargetCostKind CostKind,
1892 unsigned Index, Value *Op0,
1893 Value *Op1) {
1894 assert(Val->isVectorTy() && "This must be a vector type");
1896 if (Opcode != Instruction::ExtractElement &&
1897 Opcode != Instruction::InsertElement)
1898 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1900 // Legalize the type.
1901 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1903 // This type is legalized to a scalar type.
1904 if (!LT.second.isVector()) {
1905 auto *FixedVecTy = cast<FixedVectorType>(Val);
1906 // If Index is a known constant, cost is zero.
1907 if (Index != -1U)
1908 return 0;
1909 // Extract/InsertElement with non-constant index is very costly when
1910 // scalarized; estimate cost of loads/stores sequence via the stack:
1911 // ExtractElement cost: store vector to stack, load scalar;
1912 // InsertElement cost: store vector to stack, store scalar, load vector.
1913 Type *ElemTy = FixedVecTy->getElementType();
1914 auto NumElems = FixedVecTy->getNumElements();
1915 auto Align = DL.getPrefTypeAlign(ElemTy);
1916 InstructionCost LoadCost =
1917 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
1918 InstructionCost StoreCost =
1919 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1920 return Opcode == Instruction::ExtractElement
1921 ? StoreCost * NumElems + LoadCost
1922 : (StoreCost + LoadCost) * NumElems + StoreCost;
1923 }
1925 // For unsupported scalable vector.
1926 if (LT.second.isScalableVector() && !LT.first.isValid())
1927 return LT.first;
1929 // Mask vector extract/insert is expanded via e8.
1930 if (Val->getScalarSizeInBits() == 1) {
1931 VectorType *WideTy =
1932 VectorType::get(IntegerType::get(Val->getContext(), 8),
1933 cast<VectorType>(Val)->getElementCount());
1934 if (Opcode == Instruction::ExtractElement) {
1935 InstructionCost ExtendCost
1936 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1937 TTI::CastContextHint::None, CostKind);
1938 InstructionCost ExtractCost
1939 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1940 return ExtendCost + ExtractCost;
1941 }
1942 InstructionCost ExtendCost
1943 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
1944 TTI::CastContextHint::None, CostKind);
1945 InstructionCost InsertCost
1946 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
1947 InstructionCost TruncCost
1948 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
1949 TTI::CastContextHint::None, CostKind);
1950 return ExtendCost + InsertCost + TruncCost;
1951 }
1954 // In RVV, we can use vslidedown + vmv.x.s to extract an element from a vector
1955 // and vslideup + vmv.s.x to insert an element into a vector.
1956 unsigned BaseCost = 1;
1957 // For insertelement an extra addi is needed to compute Index + 1 for the VL of the vslideup.
1958 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
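// E.g. under this model, extracting at a non-zero constant index is
// vslidedown.vi + vmv.x.s (cost 2), while extracting element 0 is just
// vmv.x.s (cost 1).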
1960 if (Index != -1U) {
1961 // The type may be split. For fixed-width vectors we can normalize the
1962 // index to the new type.
1963 if (LT.second.isFixedLengthVector()) {
1964 unsigned Width = LT.second.getVectorNumElements();
1965 Index = Index % Width;
1966 }
1968 // If exact VLEN is known, we will insert/extract into the appropriate
1969 // subvector with no additional subvector insert/extract cost.
1970 if (auto VLEN = ST->getRealVLen()) {
1971 unsigned EltSize = LT.second.getScalarSizeInBits();
1972 unsigned M1Max = *VLEN / EltSize;
1973 Index = Index % M1Max;
1974 }
1976 // We could extract/insert the first element without vslidedown/vslideup.
1977 if (Index == 0)
1978 SlideCost = 0;
1979 else if (Opcode == Instruction::InsertElement)
1980 SlideCost = 1; // With a constant index, we do not need to use addi.
1981 }
1983 // When the vector needs to be split into multiple register groups and the
1984 // index exceeds a single vector register group, we need to insert/extract the
1985 // element via the stack.
1986 if (LT.first > 1 &&
1987 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
1988 LT.second.isScalableVector()))) {
1989 Type *ScalarType = Val->getScalarType();
1990 Align VecAlign = DL.getPrefTypeAlign(Val);
1991 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
1992 // Extra addi for unknown index.
1993 InstructionCost IdxCost = Index == -1U ? 1 : 0;
1995 // Store all split vectors into stack and load the target element.
1996 if (Opcode == Instruction::ExtractElement)
1997 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
1998 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
1999 CostKind) +
2000 IdxCost;
2002 // Store all split vectors into stack and store the target element and load
2003 // vectors back.
2004 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2005 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2006 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2007 CostKind) +
2008 IdxCost;
2009 }
2011 // Extracting or inserting an i64 element on a target with XLEN=32 needs more instructions.
2012 if (Val->getScalarType()->isIntegerTy() &&
2013 ST->getXLen() < Val->getScalarSizeInBits()) {
2014 // For extractelement, we need the following instructions:
2015 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2016 // vslidedown.vx v8, v8, a0
2017 // vmv.x.s a0, v8
2018 // li a1, 32
2019 // vsrl.vx v8, v8, a1
2020 // vmv.x.s a1, v8
2022 // For insertelement, we need the following instructions:
2023 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2024 // vmv.v.i v12, 0
2025 // vslide1up.vx v16, v12, a1
2026 // vslide1up.vx v12, v16, a0
2027 // addi a0, a2, 1
2028 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2029 // vslideup.vx v8, v12, a2
2031 // TODO: should we count these special vsetvlis?
2032 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2033 }
2034 return BaseCost + SlideCost;
2035 }
2037 InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2038 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2039 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2040 ArrayRef<const Value *> Args, const Instruction *CxtI) {
2042 // TODO: Handle more cost kinds.
2043 if (CostKind != TTI::TCK_RecipThroughput)
2044 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2045 Args, CxtI);
2047 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2048 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2049 Args, CxtI);
2051 // Skip if scalar size of Ty is bigger than ELEN.
2052 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2053 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2054 Args, CxtI);
2056 // Legalize the type.
2057 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2059 // TODO: Handle scalar type.
2060 if (!LT.second.isVector())
2061 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2062 Args, CxtI);
2064 // f16 with zvfhmin and bf16 will be promoted to f32.
2065 // FIXME: nxv32[b]f16 will be custom lowered and split.
2066 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2067 InstructionCost CastCost = 0;
2068 if ((LT.second.getVectorElementType() == MVT::f16 ||
2069 LT.second.getVectorElementType() == MVT::bf16) &&
2070 TLI->getOperationAction(ISDOpcode, LT.second) ==
2071 TargetLoweringBase::LegalizeAction::Promote) {
2072 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2073 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2074 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2075 // Add cost of extending arguments
2076 CastCost += LT.first * Args.size() *
2077 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2078 TTI::CastContextHint::None, CostKind);
2079 // Add cost of truncating result
2080 CastCost +=
2081 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2082 TTI::CastContextHint::None, CostKind);
2083 // Compute cost of op in promoted type
2084 LT.second = PromotedVT;
2085 }
2087 auto getConstantMatCost =
2088 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2089 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2090 // Two sub-cases:
2091 // * Has a 5 bit immediate operand which can be splatted.
2092 // * Has a larger immediate which must be materialized in scalar register
2093 // We return 0 for both as we currently ignore the cost of materializing
2094 // scalar constants in GPRs.
2095 return 0;
2097 return getConstantPoolLoadCost(Ty, CostKind);
2098 };
2100 // Add the cost of materializing any constant vectors required.
2101 InstructionCost ConstantMatCost = 0;
2102 if (Op1Info.isConstant())
2103 ConstantMatCost += getConstantMatCost(0, Op1Info);
2104 if (Op2Info.isConstant())
2105 ConstantMatCost += getConstantMatCost(1, Op2Info);
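// The switch below maps the IR opcode to one representative RVV instruction;
// opcodes in the same class (e.g. add/sub, all the shifts) are assumed to
// have the same cost.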
2107 unsigned Op;
2108 switch (ISDOpcode) {
2109 case ISD::ADD:
2110 case ISD::SUB:
2111 Op = RISCV::VADD_VV;
2112 break;
2113 case ISD::SHL:
2114 case ISD::SRL:
2115 case ISD::SRA:
2116 Op = RISCV::VSLL_VV;
2117 break;
2118 case ISD::AND:
2119 case ISD::OR:
2120 case ISD::XOR:
2121 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2122 break;
2123 case ISD::MUL:
2124 case ISD::MULHS:
2125 case ISD::MULHU:
2126 Op = RISCV::VMUL_VV;
2127 break;
2128 case ISD::SDIV:
2129 case ISD::UDIV:
2130 Op = RISCV::VDIV_VV;
2131 break;
2132 case ISD::SREM:
2133 case ISD::UREM:
2134 Op = RISCV::VREM_VV;
2135 break;
2136 case ISD::FADD:
2137 case ISD::FSUB:
2138 Op = RISCV::VFADD_VV;
2139 break;
2140 case ISD::FMUL:
2141 Op = RISCV::VFMUL_VV;
2142 break;
2143 case ISD::FDIV:
2144 Op = RISCV::VFDIV_VV;
2145 break;
2146 case ISD::FNEG:
2147 Op = RISCV::VFSGNJN_VV;
2148 break;
2149 default:
2150 // Assuming all other instructions have the same cost until a need arises to
2151 // differentiate them.
2152 return CastCost + ConstantMatCost +
2153 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2154 Args, CxtI);
2155 }
2157 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2158 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2159 // ops are twice as expensive as integer ops. Do the same for vectors so
2160 // scalar floating point ops aren't cheaper than their vector equivalents.
2161 if (Ty->isFPOrFPVectorTy())
2162 InstrCost *= 2;
2163 return CastCost + ConstantMatCost + LT.first * InstrCost;
2164 }
2166 // TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2167 InstructionCost RISCVTTIImpl::getPointersChainCost(
2168 ArrayRef<const Value *> Ptrs, const Value *Base,
2169 const TTI::PointersChainInfo &Info, Type *AccessTy,
2170 TTI::TargetCostKind CostKind) {
2171 InstructionCost Cost = TTI::TCC_Free;
2172 // In the basic model we take into account GEP instructions only
2173 // (although an alloca instruction, a value, constants and/or constant
2174 // expressions, PHIs, bitcasts ... whatever is allowed to be used as a
2175 // pointer can appear here). Typically, if Base is not a GEP instruction and
2176 // all the pointers are relative to the same base address, all the rest are
2177 // either GEP instructions, PHIs, bitcasts or constants. When we have the
2178 // same base, we just calculate the cost of each non-Base GEP as an ADD
2179 // operation if any of its indices is a non-constant.
2180 // If there are no known dependencies between the pointers, the cost is
2181 // calculated as a sum of the costs of the GEP instructions.
2182 for (auto [I, V] : enumerate(Ptrs)) {
2183 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2184 if (!GEP)
2185 continue;
2186 if (Info.isSameBase() && V != Base) {
2187 if (GEP->hasAllConstantIndices())
2188 continue;
2189 // If the chain is unit-stride and BaseReg + stride*i is a legal
2190 // addressing mode, then presume the base GEP is sitting around in a
2191 // register somewhere and check if we can fold the offset relative to
2192 // it.
2193 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2194 if (Info.isUnitStride() &&
2195 isLegalAddressingMode(AccessTy,
2196 /* BaseGV */ nullptr,
2197 /* BaseOffset */ Stride * I,
2198 /* HasBaseReg */ true,
2199 /* Scale */ 0,
2200 GEP->getType()->getPointerAddressSpace()))
2201 continue;
2202 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2203 {TTI::OK_AnyValue, TTI::OP_None},
2204 {TTI::OK_AnyValue, TTI::OP_None}, {});
2205 } else {
2206 SmallVector<const Value *> Indices(GEP->indices());
2207 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2208 Indices, AccessTy, CostKind);
2209 }
2210 }
2211 return Cost;
2212 }
2214 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2215 TTI::UnrollingPreferences &UP,
2216 OptimizationRemarkEmitter *ORE) {
2217 // TODO: More tuning on benchmarks and metrics with changes as needed
2218 // would apply to all settings below to enable performance.
2221 if (ST->enableDefaultUnroll())
2222 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2224 // Enable Upper bound unrolling universally, not dependent upon the conditions
2225 // below.
2226 UP.UpperBound = true;
2228 // Disable loop unrolling for Oz and Os.
2229 UP.OptSizeThreshold = 0;
2230 UP.PartialOptSizeThreshold = 0;
2231 if (L->getHeader()->getParent()->hasOptSize())
2232 return;
2234 SmallVector<BasicBlock *, 4> ExitingBlocks;
2235 L->getExitingBlocks(ExitingBlocks);
2236 LLVM_DEBUG(dbgs() << "Loop has:\n"
2237 << "Blocks: " << L->getNumBlocks() << "\n"
2238 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2240 // Allow at most one exit other than the latch. This acts as an early exit
2241 // as it mirrors the profitability calculation of the runtime unroller.
2242 if (ExitingBlocks.size() > 2)
2243 return;
2245 // Limit the CFG of the loop body for targets with a branch predictor.
2246 // Allowing 4 blocks permits if-then-else diamonds in the body.
2247 if (L->getNumBlocks() > 4)
2248 return;
2250 // Don't unroll vectorized loops, including the remainder loop
2251 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2252 return;
2254 // Scan the loop: don't unroll loops with calls as this could prevent
2255 // inlining.
2256 InstructionCost Cost = 0;
2257 for (auto *BB : L->getBlocks()) {
2258 for (auto &I : *BB) {
2259 // Initial setting - Don't unroll loops containing vectorized
2260 // instructions.
2261 if (I.getType()->isVectorTy())
2262 return;
2264 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2265 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2266 if (!isLoweredToCall(F))
2267 continue;
2268 }
2269 return;
2270 }
2272 SmallVector<const Value *> Operands(I.operand_values());
2273 Cost += getInstructionCost(&I, Operands,
2274 TargetTransformInfo::TCK_SizeAndLatency);
2275 }
2276 }
2278 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2280 UP.Partial = true;
2281 UP.Runtime = true;
2282 UP.UnrollRemainder = true;
2283 UP.UnrollAndJam = true;
2285 // Forcing unrolling of small loops can be very useful because of the
2286 // branch-taken cost of the backedge.
2287 if (Cost < 12)
2288 UP.Force = true;
2289 }
2291 void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2292 TTI::PeelingPreferences &PP) {
2293 BaseT::getPeelingPreferences(L, SE, PP);
2294 }
2296 unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
2297 if (Ty->isVectorTy()) {
2298 // f16 with only zvfhmin and bf16 will be promoted to f32
2299 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2300 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2301 EltTy->isBFloatTy())
2302 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
2303 cast<VectorType>(Ty));
2305 TypeSize Size = DL.getTypeSizeInBits(Ty);
2306 if (Size.isScalable() && ST->hasVInstructions())
2307 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2309 if (ST->useRVVForFixedLengthVectors())
2310 return divideCeil(Size, ST->getRealMinVLen());
2311 }
2313 return BaseT::getRegUsageForType(Ty);
2314 }
2316 unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2317 if (SLPMaxVF.getNumOccurrences())
2318 return SLPMaxVF;
2320 // Return how many elements can fit in getRegisterBitWidth. This is the
2321 // same routine as used in LoopVectorizer. We should probably be
2322 // accounting for whether we actually have instructions with the right
2323 // lane type, but we don't have enough information to do that without
2324 // some additional plumbing which hasn't been justified yet.
2325 TypeSize RegWidth =
2326 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
2327 // If no vector registers, or absurd element widths, disable
2328 // vectorization by returning 1.
2329 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2330 }
2332 TTI::AddressingModeKind
2333 RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
2334 ScalarEvolution *SE) const {
2335 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2336 return TTI::AMK_PostIndexed;
2338 return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
2339 }
2341 bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
2342 const TargetTransformInfo::LSRCost &C2) {
2343 // The RISC-V-specific rule here is "instruction number 1st priority".
2344 // If we need to emit adds inside the loop to add up base registers, then
2345 // we need at least one extra temporary register.
2346 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2347 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2348 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2349 C1.NumIVMuls, C1.NumBaseAdds,
2350 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2351 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2352 C2.NumIVMuls, C2.NumBaseAdds,
2353 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2354 }
2356 bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
2357 auto *VTy = dyn_cast<VectorType>(DataTy);
2358 if (!VTy || VTy->isScalableTy())
2359 return false;
2361 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2362 return false;
2364 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2365 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2366 if (VTy->getElementType()->isIntegerTy(8))
2367 if (VTy->getElementCount().getFixedValue() > 256)
2368 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2369 ST->getMaxLMULForFixedLengthVectors();
2370 return true;
2371 }
2373 bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
2374 auto *VTy = dyn_cast<VectorType>(DataTy);
2375 if (!VTy || VTy->isScalableTy())
2376 return false;
2378 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2379 return false;
2380 return true;
2381 }
2383 /// See if \p I should be considered for address type promotion. We check if \p
2384 /// I is a sext with the right type and used in memory accesses. If it is used in a
2385 /// "complex" getelementptr, we allow it to be promoted without finding other
2386 /// sext instructions that sign extended the same initial value. A getelementptr
2387 /// is considered as "complex" if it has more than 2 operands.
2388 bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
2389 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2390 bool Considerable = false;
2391 AllowPromotionWithoutCommonHeader = false;
2392 if (!isa<SExtInst>(&I))
2393 return false;
2394 Type *ConsideredSExtType =
2395 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2396 if (I.getType() != ConsideredSExtType)
2397 return false;
2398 // See if the sext is the one with the right type and used in at least one
2399 // GetElementPtrInst.
2400 for (const User *U : I.users()) {
2401 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2402 Considerable = true;
2403 // A getelementptr is considered as "complex" if it has more than 2
2404 // operands. We will promote a SExt used in such complex GEP as we
2405 // expect some computation to be merged if they are done on 64 bits.
2406 if (GEPInst->getNumOperands() > 2) {
2407 AllowPromotionWithoutCommonHeader = true;
2408 break;
2409 }
2410 }
2411 }
2412 return Considerable;
2413 }
2415 bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2416 switch (Opcode) {
2417 case Instruction::Add:
2418 case Instruction::Sub:
2419 case Instruction::Mul:
2420 case Instruction::And:
2421 case Instruction::Or:
2422 case Instruction::Xor:
2423 case Instruction::FAdd:
2424 case Instruction::FSub:
2425 case Instruction::FMul:
2426 case Instruction::FDiv:
2427 case Instruction::ICmp:
2428 case Instruction::FCmp:
2429 return true;
2430 case Instruction::Shl:
2431 case Instruction::LShr:
2432 case Instruction::AShr:
2433 case Instruction::UDiv:
2434 case Instruction::SDiv:
2435 case Instruction::URem:
2436 case Instruction::SRem:
2437 case Instruction::Select:
2438 return Operand == 1;
2439 default:
2440 return false;
2441 }
2442 }
2444 bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
2445 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2446 return false;
2448 if (canSplatOperand(I->getOpcode(), Operand))
2449 return true;
2451 auto *II = dyn_cast<IntrinsicInst>(I);
2452 if (!II)
2453 return false;
2455 switch (II->getIntrinsicID()) {
2456 case Intrinsic::fma:
2457 case Intrinsic::vp_fma:
2458 case Intrinsic::fmuladd:
2459 case Intrinsic::vp_fmuladd:
2460 return Operand == 0 || Operand == 1;
2461 case Intrinsic::vp_shl:
2462 case Intrinsic::vp_lshr:
2463 case Intrinsic::vp_ashr:
2464 case Intrinsic::vp_udiv:
2465 case Intrinsic::vp_sdiv:
2466 case Intrinsic::vp_urem:
2467 case Intrinsic::vp_srem:
2468 case Intrinsic::ssub_sat:
2469 case Intrinsic::vp_ssub_sat:
2470 case Intrinsic::usub_sat:
2471 case Intrinsic::vp_usub_sat:
2472 case Intrinsic::vp_select:
2473 return Operand == 1;
2474 // These intrinsics are commutative.
2475 case Intrinsic::vp_add:
2476 case Intrinsic::vp_mul:
2477 case Intrinsic::vp_and:
2478 case Intrinsic::vp_or:
2479 case Intrinsic::vp_xor:
2480 case Intrinsic::vp_fadd:
2481 case Intrinsic::vp_fmul:
2482 case Intrinsic::vp_icmp:
2483 case Intrinsic::vp_fcmp:
2484 case Intrinsic::smin:
2485 case Intrinsic::vp_smin:
2486 case Intrinsic::umin:
2487 case Intrinsic::vp_umin:
2488 case Intrinsic::smax:
2489 case Intrinsic::vp_smax:
2490 case Intrinsic::umax:
2491 case Intrinsic::vp_umax:
2492 case Intrinsic::sadd_sat:
2493 case Intrinsic::vp_sadd_sat:
2494 case Intrinsic::uadd_sat:
2495 case Intrinsic::vp_uadd_sat:
2496 // These intrinsics have 'vr' versions.
2497 case Intrinsic::vp_sub:
2498 case Intrinsic::vp_fsub:
2499 case Intrinsic::vp_fdiv:
2500 return Operand == 0 || Operand == 1;
2501 default:
2502 return false;
2503 }
2504 }
2506 /// Check if sinking \p I's operands to I's basic block is profitable, because
2507 /// the operands can be folded into a target instruction, e.g.
2508 /// splats of scalars can fold into vector instructions.
2509 bool RISCVTTIImpl::isProfitableToSinkOperands(
2510 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
2511 using namespace llvm::PatternMatch;
2513 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2514 return false;
2516 // Don't sink splat operands if the target prefers not to. Some targets require
2517 // S2V transfer buffers and we can run out of them copying the same value
2518 // repeatedly.
2519 // FIXME: It could still be worth doing if it would improve vector register
2520 // pressure and prevent a vector spill.
2521 if (!ST->sinkSplatOperands())
2522 return false;
2524 for (auto OpIdx : enumerate(I->operands())) {
2525 if (!canSplatOperand(I, OpIdx.index()))
2526 continue;
2528 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2529 // Make sure we are not already sinking this operand
2530 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2531 continue;
2533 // We are looking for a splat that can be sunk.
2534 if (!match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
2535 m_Undef(), m_ZeroMask())))
2536 continue;
2538 // Don't sink i1 splats.
2539 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2540 continue;
2542 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2543 // and vector registers
2544 for (Use &U : Op->uses()) {
2545 Instruction *Insn = cast<Instruction>(U.getUser());
2546 if (!canSplatOperand(Insn, U.getOperandNo()))
2547 return false;
2548 }
2550 Ops.push_back(&Op->getOperandUse(0));
2551 Ops.push_back(&OpIdx.value());
2552 }
2553 return true;
2554 }
2556 RISCVTTIImpl::TTI::MemCmpExpansionOptions
2557 RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2558 TTI::MemCmpExpansionOptions Options;
2559 // TODO: Enable expansion when unaligned access is not supported after we fix
2560 // issues in ExpandMemcmp.
2561 if (!ST->enableUnalignedScalarMem())
2562 return Options;
2564 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
2565 return Options;
2567 Options.AllowOverlappingLoads = true;
2568 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2569 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2570 if (ST->is64Bit()) {
2571 Options.LoadSizes = {8, 4, 2, 1};
2572 Options.AllowedTailExpansions = {3, 5, 6};
2573 } else {
2574 Options.LoadSizes = {4, 2, 1};
2575 Options.AllowedTailExpansions = {3};
2576 }
2577 return Options;