llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

   1 //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "AArch64ExpandImm.h"
  10 #include "AArch64TargetTransformInfo.h"
  11 #include "MCTargetDesc/AArch64AddressingModes.h"
  12 #include "llvm/Analysis/LoopInfo.h"
  13 #include "llvm/Analysis/TargetTransformInfo.h"
  14 #include "llvm/CodeGen/BasicTTIImpl.h"
  15 #include "llvm/CodeGen/CostTable.h"
  16 #include "llvm/CodeGen/TargetLowering.h"
  17 #include "llvm/IR/IntrinsicInst.h"
  18 #include "llvm/IR/IntrinsicsAArch64.h"
  19 #include "llvm/Support/Debug.h"
  20 #include <algorithm>
  21 using namespace llvm;
  22
  23 #define DEBUG_TYPE "aarch64tti"
  24
  25 static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
  26                                                cl::init(true), cl::Hidden);
  27
  28 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
  29                                          const Function *Callee) const {
  30   const TargetMachine &TM = getTLI()->getTargetMachine();
  31
  32   const FeatureBitset &CallerBits =
  33       TM.getSubtargetImpl(*Caller)->getFeatureBits();
  34   const FeatureBitset &CalleeBits =
  35       TM.getSubtargetImpl(*Callee)->getFeatureBits();
  36
  37   // Inline a callee if its target-features are a subset of the callers
  38   // target-features.
  39   return (CallerBits & CalleeBits) == CalleeBits;
  40 }
  41
  42 /// Calculate the cost of materializing a 64-bit value. This helper
  43 /// method might only calculate a fraction of a larger immediate. Therefore it
  44 /// is valid to return a cost of ZERO.
  45 int AArch64TTIImpl::getIntImmCost(int64_t Val) {
  46   // Check if the immediate can be encoded within an instruction.
  47   if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
  48     return 0;
  49
  50   if (Val < 0)
  51     Val = ~Val;
  52
  53   // Calculate how many moves we will need to materialize this constant.
  54   SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  55   AArch64_IMM::expandMOVImm(Val, 64, Insn);
  56   return Insn.size();
  57 }
  58
  59 /// Calculate the cost of materializing the given constant.
  60 int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  61   assert(Ty->isIntegerTy());
  62
  63   unsigned BitSize = Ty->getPrimitiveSizeInBits();
  64   if (BitSize == 0)
  65     return ~0U;
  66
  67   // Sign-extend all constants to a multiple of 64-bit.
  68   APInt ImmVal = Imm;
  69   if (BitSize & 0x3f)
  70     ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
  71
  72   // Split the constant into 64-bit chunks and calculate the cost for each
  73   // chunk.
  74   int Cost = 0;
  75   for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
  76     APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
  77     int64_t Val = Tmp.getSExtValue();
  78     Cost += getIntImmCost(Val);
  79   }
  80   // We need at least one instruction to materialze the constant.
  81   return std::max(1, Cost);
  82 }
  83
  84 int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
  85                                       const APInt &Imm, Type *Ty) {
  86   assert(Ty->isIntegerTy());
  87
  88   unsigned BitSize = Ty->getPrimitiveSizeInBits();
  89   // There is no cost model for constants with a bit size of 0. Return TCC_Free
  90   // here, so that constant hoisting will ignore this constant.
  91   if (BitSize == 0)
  92     return TTI::TCC_Free;
  93
  94   unsigned ImmIdx = ~0U;
  95   switch (Opcode) {
  96   default:
  97     return TTI::TCC_Free;
  98   case Instruction::GetElementPtr:
  99     // Always hoist the base address of a GetElementPtr.
 100     if (Idx == 0)
 101       return 2 * TTI::TCC_Basic;
 102     return TTI::TCC_Free;
 103   case Instruction::Store:
 104     ImmIdx = 0;
 105     break;
 106   case Instruction::Add:
 107   case Instruction::Sub:
 108   case Instruction::Mul:
 109   case Instruction::UDiv:
 110   case Instruction::SDiv:
 111   case Instruction::URem:
 112   case Instruction::SRem:
 113   case Instruction::And:
 114   case Instruction::Or:
 115   case Instruction::Xor:
 116   case Instruction::ICmp:
 117     ImmIdx = 1;
 118     break;
 119   // Always return TCC_Free for the shift value of a shift instruction.
 120   case Instruction::Shl:
 121   case Instruction::LShr:
 122   case Instruction::AShr:
 123     if (Idx == 1)
 124       return TTI::TCC_Free;
 125     break;
 126   case Instruction::Trunc:
 127   case Instruction::ZExt:
 128   case Instruction::SExt:
 129   case Instruction::IntToPtr:
 130   case Instruction::PtrToInt:
 131   case Instruction::BitCast:
 132   case Instruction::PHI:
 133   case Instruction::Call:
 134   case Instruction::Select:
 135   case Instruction::Ret:
 136   case Instruction::Load:
 137     break;
 138   }
 139
 140   if (Idx == ImmIdx) {
 141     int NumConstants = (BitSize + 63) / 64;
 142     int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
 143     return (Cost <= NumConstants * TTI::TCC_Basic)
 144                ? static_cast<int>(TTI::TCC_Free)
 145                : Cost;
 146   }
 147   return AArch64TTIImpl::getIntImmCost(Imm, Ty);
 148 }
 149
 150 int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
 151                                         const APInt &Imm, Type *Ty) {
 152   assert(Ty->isIntegerTy());
 153
 154   unsigned BitSize = Ty->getPrimitiveSizeInBits();
 155   // There is no cost model for constants with a bit size of 0. Return TCC_Free
 156   // here, so that constant hoisting will ignore this constant.
 157   if (BitSize == 0)
 158     return TTI::TCC_Free;
 159
 160   // Most (all?) AArch64 intrinsics do not support folding immediates into the
 161   // selected instruction, so we compute the materialization cost for the
 162   // immediate directly.
 163   if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
 164     return AArch64TTIImpl::getIntImmCost(Imm, Ty);
 165
 166   switch (IID) {
 167   default:
 168     return TTI::TCC_Free;
 169   case Intrinsic::sadd_with_overflow:
 170   case Intrinsic::uadd_with_overflow:
 171   case Intrinsic::ssub_with_overflow:
 172   case Intrinsic::usub_with_overflow:
 173   case Intrinsic::smul_with_overflow:
 174   case Intrinsic::umul_with_overflow:
 175     if (Idx == 1) {
 176       int NumConstants = (BitSize + 63) / 64;
 177       int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
 178       return (Cost <= NumConstants * TTI::TCC_Basic)
 179                  ? static_cast<int>(TTI::TCC_Free)
 180                  : Cost;
 181     }
 182     break;
 183   case Intrinsic::experimental_stackmap:
 184     if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
 185       return TTI::TCC_Free;
 186     break;
 187   case Intrinsic::experimental_patchpoint_void:
 188   case Intrinsic::experimental_patchpoint_i64:
 189     if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
 190       return TTI::TCC_Free;
 191     break;
 192   }
 193   return AArch64TTIImpl::getIntImmCost(Imm, Ty);
 194 }
 195
 196 TargetTransformInfo::PopcntSupportKind
 197 AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
 198   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
 199   if (TyWidth == 32 || TyWidth == 64)
 200     return TTI::PSK_FastHardware;
 201   // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
 202   return TTI::PSK_Software;
 203 }
 204
 205 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
 206                                            ArrayRef<const Value *> Args) {
 207
 208   // A helper that returns a vector type from the given type. The number of
 209   // elements in type Ty determine the vector width.
 210   auto toVectorTy = [&](Type *ArgTy) {
 211     return VectorType::get(ArgTy->getScalarType(),
 212                            DstTy->getVectorNumElements());
 213   };
 214
 215   // Exit early if DstTy is not a vector type whose elements are at least
 216   // 16-bits wide.
 217   if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
 218     return false;
 219
 220   // Determine if the operation has a widening variant. We consider both the
 221   // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
 222   // instructions.
 223   //
 224   // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
 225   //       verify that their extending operands are eliminated during code
 226   //       generation.
 227   switch (Opcode) {
 228   case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
 229   case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
 230     break;
 231   default:
 232     return false;
 233   }
 234
 235   // To be a widening instruction (either the "wide" or "long" versions), the
 236   // second operand must be a sign- or zero extend having a single user. We
 237   // only consider extends having a single user because they may otherwise not
 238   // be eliminated.
 239   if (Args.size() != 2 ||
 240       (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
 241       !Args[1]->hasOneUse())
 242     return false;
 243   auto *Extend = cast<CastInst>(Args[1]);
 244
 245   // Legalize the destination type and ensure it can be used in a widening
 246   // operation.
 247   auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
 248   unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
 249   if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
 250     return false;
 251
 252   // Legalize the source type and ensure it can be used in a widening
 253   // operation.
 254   Type *SrcTy = toVectorTy(Extend->getSrcTy());
 255   auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
 256   unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
 257   if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
 258     return false;
 259
 260   // Get the total number of vector elements in the legalized types.
 261   unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
 262   unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
 263
 264   // Return true if the legalized types have the same number of vector elements
 265   // and the destination element type size is twice that of the source type.
 266   return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
 267 }
 268
 269 int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 270                                      const Instruction *I) {
 271   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 272   assert(ISD && "Invalid opcode");
 273
 274   // If the cast is observable, and it is used by a widening instruction (e.g.,
 275   // uaddl, saddw, etc.), it may be free.
 276   if (I && I->hasOneUse()) {
 277     auto *SingleUser = cast<Instruction>(*I->user_begin());
 278     SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
 279     if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
 280       // If the cast is the second operand, it is free. We will generate either
 281       // a "wide" or "long" version of the widening instruction.
 282       if (I == SingleUser->getOperand(1))
 283         return 0;
 284       // If the cast is not the second operand, it will be free if it looks the
 285       // same as the second operand. In this case, we will generate a "long"
 286       // version of the widening instruction.
 287       if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
 288         if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
 289             cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
 290           return 0;
 291     }
 292   }
 293
 294   EVT SrcTy = TLI->getValueType(DL, Src);
 295   EVT DstTy = TLI->getValueType(DL, Dst);
 296
 297   if (!SrcTy.isSimple() || !DstTy.isSimple())
 298     return BaseT::getCastInstrCost(Opcode, Dst, Src);
 299
 300   static const TypeConversionCostTblEntry
 301   ConversionTbl[] = {
 302     { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32,  1 },
 303     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64,  0 },
 304     { ISD::TRUNCATE, MVT::v8i8,  MVT::v8i32,  3 },
 305     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
 306
 307     // The number of shll instructions for the extension.
 308     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
 309     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
 310     { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
 311     { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 2 },
 312     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
 313     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
 314     { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
 315     { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 2 },
 316     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
 317     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i8,  7 },
 318     { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
 319     { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v8i16, 6 },
 320     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
 321     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
 322     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 323     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 324
 325     // LowerVectorINT_TO_FP:
 326     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
 327     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
 328     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
 329     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
 330     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
 331     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
 332
 333     // Complex: to v2f32
 334     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
 335     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
 336     { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
 337     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8,  3 },
 338     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
 339     { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
 340
 341     // Complex: to v4f32
 342     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8,  4 },
 343     { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
 344     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8,  3 },
 345     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
 346
 347     // Complex: to v8f32
 348     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
 349     { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
 350     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8,  10 },
 351     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
 352
 353     // Complex: to v16f32
 354     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
 355     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
 356
 357     // Complex: to v2f64
 358     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
 359     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
 360     { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
 361     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8,  4 },
 362     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
 363     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
 364
 365
 366     // LowerVectorFP_TO_INT
 367     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
 368     { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
 369     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
 370     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
 371     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
 372     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
 373
 374     // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
 375     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
 376     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
 377     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f32, 1 },
 378     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
 379     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
 380     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f32, 1 },
 381
 382     // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
 383     { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
 384     { ISD::FP_TO_SINT, MVT::v4i8,  MVT::v4f32, 2 },
 385     { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
 386     { ISD::FP_TO_UINT, MVT::v4i8,  MVT::v4f32, 2 },
 387
 388     // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
 389     { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
 390     { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
 391     { ISD::FP_TO_SINT, MVT::v2i8,  MVT::v2f64, 2 },
 392     { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
 393     { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
 394     { ISD::FP_TO_UINT, MVT::v2i8,  MVT::v2f64, 2 },
 395   };
 396
 397   if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
 398                                                  DstTy.getSimpleVT(),
 399                                                  SrcTy.getSimpleVT()))
 400     return Entry->Cost;
 401
 402   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 403 }
 404
 405 int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
 406                                              VectorType *VecTy,
 407                                              unsigned Index) {
 408
 409   // Make sure we were given a valid extend opcode.
 410   assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
 411          "Invalid opcode");
 412
 413   // We are extending an element we extract from a vector, so the source type
 414   // of the extend is the element type of the vector.
 415   auto *Src = VecTy->getElementType();
 416
 417   // Sign- and zero-extends are for integer types only.
 418   assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
 419
 420   // Get the cost for the extract. We compute the cost (if any) for the extend
 421   // below.
 422   auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index);
 423
 424   // Legalize the types.
 425   auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
 426   auto DstVT = TLI->getValueType(DL, Dst);
 427   auto SrcVT = TLI->getValueType(DL, Src);
 428
 429   // If the resulting type is still a vector and the destination type is legal,
 430   // we may get the extension for free. If not, get the default cost for the
 431   // extend.
 432   if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
 433     return Cost + getCastInstrCost(Opcode, Dst, Src);
 434
 435   // The destination type should be larger than the element type. If not, get
 436   // the default cost for the extend.
 437   if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
 438     return Cost + getCastInstrCost(Opcode, Dst, Src);
 439
 440   switch (Opcode) {
 441   default:
 442     llvm_unreachable("Opcode should be either SExt or ZExt");
 443
 444   // For sign-extends, we only need a smov, which performs the extension
 445   // automatically.
 446   case Instruction::SExt:
 447     return Cost;
 448
 449   // For zero-extends, the extend is performed automatically by a umov unless
 450   // the destination type is i64 and the element type is i8 or i16.
 451   case Instruction::ZExt:
 452     if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
 453       return Cost;
 454   }
 455
 456   // If we are unable to perform the extend for free, get the default cost.
 457   return Cost + getCastInstrCost(Opcode, Dst, Src);
 458 }
 459
 460 int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
 461                                        unsigned Index) {
 462   assert(Val->isVectorTy() && "This must be a vector type");
 463
 464   if (Index != -1U) {
 465     // Legalize the type.
 466     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
 467
 468     // This type is legalized to a scalar type.
 469     if (!LT.second.isVector())
 470       return 0;
 471
 472     // The type may be split. Normalize the index to the new type.
 473     unsigned Width = LT.second.getVectorNumElements();
 474     Index = Index % Width;
 475
 476     // The element at index zero is already inside the vector.
 477     if (Index == 0)
 478       return 0;
 479   }
 480
 481   // All other insert/extracts cost this much.
 482   return ST->getVectorInsertExtractBaseCost();
 483 }
 484
 485 int AArch64TTIImpl::getArithmeticInstrCost(
 486     unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
 487     TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
 488     TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
 489     const Instruction *CxtI) {
 490   // Legalize the type.
 491   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 492
 493   // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
 494   // add in the widening overhead specified by the sub-target. Since the
 495   // extends feeding widening instructions are performed automatically, they
 496   // aren't present in the generated code and have a zero cost. By adding a
 497   // widening overhead here, we attach the total cost of the combined operation
 498   // to the widening instruction.
 499   int Cost = 0;
 500   if (isWideningInstruction(Ty, Opcode, Args))
 501     Cost += ST->getWideningBaseCost();
 502
 503   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 504
 505   switch (ISD) {
 506   default:
 507     return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
 508                                                 Opd1PropInfo, Opd2PropInfo);
 509   case ISD::SDIV:
 510     if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
 511         Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
 512       // On AArch64, scalar signed division by constants power-of-two are
 513       // normally expanded to the sequence ADD + CMP + SELECT + SRA.
 514       // The OperandValue properties many not be same as that of previous
 515       // operation; conservatively assume OP_None.
 516       Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
 517                                      TargetTransformInfo::OP_None,
 518                                      TargetTransformInfo::OP_None);
 519       Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
 520                                      TargetTransformInfo::OP_None,
 521                                      TargetTransformInfo::OP_None);
 522       Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
 523                                      TargetTransformInfo::OP_None,
 524                                      TargetTransformInfo::OP_None);
 525       Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
 526                                      TargetTransformInfo::OP_None,
 527                                      TargetTransformInfo::OP_None);
 528       return Cost;
 529     }
 530     LLVM_FALLTHROUGH;
 531   case ISD::UDIV:
 532     if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
 533       auto VT = TLI->getValueType(DL, Ty);
 534       if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
 535         // Vector signed division by constant are expanded to the
 536         // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
 537         // to MULHS + SUB + SRL + ADD + SRL.
 538         int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
 539                                              Opd2Info,
 540                                              TargetTransformInfo::OP_None,
 541                                              TargetTransformInfo::OP_None);
 542         int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
 543                                              Opd2Info,
 544                                              TargetTransformInfo::OP_None,
 545                                              TargetTransformInfo::OP_None);
 546         int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
 547                                              Opd2Info,
 548                                              TargetTransformInfo::OP_None,
 549                                              TargetTransformInfo::OP_None);
 550         return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
 551       }
 552     }
 553
 554     Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
 555                                           Opd1PropInfo, Opd2PropInfo);
 556     if (Ty->isVectorTy()) {
 557       // On AArch64, vector divisions are not supported natively and are
 558       // expanded into scalar divisions of each pair of elements.
 559       Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
 560                                      Opd2Info, Opd1PropInfo, Opd2PropInfo);
 561       Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
 562                                      Opd2Info, Opd1PropInfo, Opd2PropInfo);
 563       // TODO: if one of the arguments is scalar, then it's not necessary to
 564       // double the cost of handling the vector elements.
 565       Cost += Cost;
 566     }
 567     return Cost;
 568
 569   case ISD::ADD:
 570   case ISD::MUL:
 571   case ISD::XOR:
 572   case ISD::OR:
 573   case ISD::AND:
 574     // These nodes are marked as 'custom' for combining purposes only.
 575     // We know that they are legal. See LowerAdd in ISelLowering.
 576     return (Cost + 1) * LT.first;
 577   }
 578 }
 579
 580 int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 581                                               const SCEV *Ptr) {
 582   // Address computations in vectorized code with non-consecutive addresses will
 583   // likely result in more instructions compared to scalar code where the
 584   // computation can more often be merged into the index mode. The resulting
 585   // extra micro-ops can significantly decrease throughput.
 586   unsigned NumVectorInstToHideOverhead = 10;
 587   int MaxMergeDistance = 64;
 588
 589   if (Ty->isVectorTy() && SE &&
 590       !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
 591     return NumVectorInstToHideOverhead;
 592
 593   // In many cases the address computation is not merged into the instruction
 594   // addressing mode.
 595   return 1;
 596 }
 597
 598 int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 599                                        Type *CondTy, const Instruction *I) {
 600
 601   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 602   // We don't lower some vector selects well that are wider than the register
 603   // width.
 604   if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
 605     // We would need this many instructions to hide the scalarization happening.
 606     const int AmortizationCost = 20;
 607     static const TypeConversionCostTblEntry
 608     VectorSelectTbl[] = {
 609       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
 610       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
 611       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
 612       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
 613       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
 614       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
 615     };
 616
 617     EVT SelCondTy = TLI->getValueType(DL, CondTy);
 618     EVT SelValTy = TLI->getValueType(DL, ValTy);
 619     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
 620       if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
 621                                                      SelCondTy.getSimpleVT(),
 622                                                      SelValTy.getSimpleVT()))
 623         return Entry->Cost;
 624     }
 625   }
 626   return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 627 }
 628
 629 AArch64TTIImpl::TTI::MemCmpExpansionOptions
 630 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
 631   TTI::MemCmpExpansionOptions Options;
 632   Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
 633   Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
 634   Options.NumLoadsPerBlock = Options.MaxNumLoads;
 635   // TODO: Though vector loads usually perform well on AArch64, in some targets
 636   // they may wake up the FP unit, which raises the power consumption.  Perhaps
 637   // they could be used with no holds barred (-O3).
 638   Options.LoadSizes = {8, 4, 2, 1};
 639   return Options;
 640 }
 641
 642 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
 643                                     MaybeAlign Alignment, unsigned AddressSpace,
 644                                     const Instruction *I) {
 645   auto LT = TLI->getTypeLegalizationCost(DL, Ty);
 646
 647   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
 648       LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
 649     // Unaligned stores are extremely inefficient. We don't split all
 650     // unaligned 128-bit stores because the negative impact that has shown in
 651     // practice on inlined block copy code.
 652     // We make such stores expensive so that we will only vectorize if there
 653     // are 6 other instructions getting vectorized.
 654     const int AmortizationCost = 6;
 655
 656     return LT.first * 2 * AmortizationCost;
 657   }
 658
 659   if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
 660     unsigned ProfitableNumElements;
 661     if (Opcode == Instruction::Store)
 662       // We use a custom trunc store lowering so v.4b should be profitable.
 663       ProfitableNumElements = 4;
 664     else
 665       // We scalarize the loads because there is not v.4b register and we
 666       // have to promote the elements to v.2.
 667       ProfitableNumElements = 8;
 668
 669     if (Ty->getVectorNumElements() < ProfitableNumElements) {
 670       unsigned NumVecElts = Ty->getVectorNumElements();
 671       unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
 672       // We generate 2 instructions per vector element.
 673       return NumVectorizableInstsToAmortize * NumVecElts * 2;
 674     }
 675   }
 676
 677   return LT.first;
 678 }
 679
 680 int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
 681                                                unsigned Factor,
 682                                                ArrayRef<unsigned> Indices,
 683                                                unsigned Alignment,
 684                                                unsigned AddressSpace,
 685                                                bool UseMaskForCond,
 686                                                bool UseMaskForGaps) {
 687   assert(Factor >= 2 && "Invalid interleave factor");
 688   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 689
 690   if (!UseMaskForCond && !UseMaskForGaps &&
 691       Factor <= TLI->getMaxSupportedInterleaveFactor()) {
 692     unsigned NumElts = VecTy->getVectorNumElements();
 693     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 694
 695     // ldN/stN only support legal vector types of size 64 or 128 in bits.
 696     // Accesses having vector types that are a multiple of 128 bits can be
 697     // matched to more than one ldN/stN instruction.
 698     if (NumElts % Factor == 0 &&
 699         TLI->isLegalInterleavedAccessType(SubVecTy, DL))
 700       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
 701   }
 702
 703   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
 704                                            Alignment, AddressSpace,
 705                                            UseMaskForCond, UseMaskForGaps);
 706 }
 707
 708 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
 709   int Cost = 0;
 710   for (auto *I : Tys) {
 711     if (!I->isVectorTy())
 712       continue;
 713     if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
 714       Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0) +
 715               getMemoryOpCost(Instruction::Load, I, Align(128), 0);
 716   }
 717   return Cost;
 718 }
 719
 720 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
 721   return ST->getMaxInterleaveFactor();
 722 }
 723
 724 // For Falkor, we want to avoid having too many strided loads in a loop since
 725 // that can exhaust the HW prefetcher resources.  We adjust the unroller
 726 // MaxCount preference below to attempt to ensure unrolling doesn't create too
 727 // many strided loads.
 728 static void
 729 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 730                               TargetTransformInfo::UnrollingPreferences &UP) {
 731   enum { MaxStridedLoads = 7 };
 732   auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
 733     int StridedLoads = 0;
 734     // FIXME? We could make this more precise by looking at the CFG and
 735     // e.g. not counting loads in each side of an if-then-else diamond.
 736     for (const auto BB : L->blocks()) {
 737       for (auto &I : *BB) {
 738         LoadInst *LMemI = dyn_cast<LoadInst>(&I);
 739         if (!LMemI)
 740           continue;
 741
 742         Value *PtrValue = LMemI->getPointerOperand();
 743         if (L->isLoopInvariant(PtrValue))
 744           continue;
 745
 746         const SCEV *LSCEV = SE.getSCEV(PtrValue);
 747         const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
 748         if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
 749           continue;
 750
 751         // FIXME? We could take pairing of unrolled load copies into account
 752         // by looking at the AddRec, but we would probably have to limit this
 753         // to loops with no stores or other memory optimization barriers.
 754         ++StridedLoads;
 755         // We've seen enough strided loads that seeing more won't make a
 756         // difference.
 757         if (StridedLoads > MaxStridedLoads / 2)
 758           return StridedLoads;
 759       }
 760     }
 761     return StridedLoads;
 762   };
 763
 764   int StridedLoads = countStridedLoads(L, SE);
 765   LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
 766                     << " strided loads\n");
 767   // Pick the largest power of 2 unroll count that won't result in too many
 768   // strided loads.
 769   if (StridedLoads) {
 770     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
 771     LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
 772                       << UP.MaxCount << '\n');
 773   }
 774 }
 775
 776 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 777                                              TTI::UnrollingPreferences &UP) {
 778   // Enable partial unrolling and runtime unrolling.
 779   BaseT::getUnrollingPreferences(L, SE, UP);
 780
 781   // For inner loop, it is more likely to be a hot one, and the runtime check
 782   // can be promoted out from LICM pass, so the overhead is less, let's try
 783   // a larger threshold to unroll more loops.
 784   if (L->getLoopDepth() > 1)
 785     UP.PartialThreshold *= 2;
 786
 787   // Disable partial & runtime unrolling on -Os.
 788   UP.PartialOptSizeThreshold = 0;
 789
 790   if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
 791       EnableFalkorHWPFUnrollFix)
 792     getFalkorUnrollingPreferences(L, SE, UP);
 793 }
 794
 795 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
 796                                                          Type *ExpectedType) {
 797   switch (Inst->getIntrinsicID()) {
 798   default:
 799     return nullptr;
 800   case Intrinsic::aarch64_neon_st2:
 801   case Intrinsic::aarch64_neon_st3:
 802   case Intrinsic::aarch64_neon_st4: {
 803     // Create a struct type
 804     StructType *ST = dyn_cast<StructType>(ExpectedType);
 805     if (!ST)
 806       return nullptr;
 807     unsigned NumElts = Inst->getNumArgOperands() - 1;
 808     if (ST->getNumElements() != NumElts)
 809       return nullptr;
 810     for (unsigned i = 0, e = NumElts; i != e; ++i) {
 811       if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
 812         return nullptr;
 813     }
 814     Value *Res = UndefValue::get(ExpectedType);
 815     IRBuilder<> Builder(Inst);
 816     for (unsigned i = 0, e = NumElts; i != e; ++i) {
 817       Value *L = Inst->getArgOperand(i);
 818       Res = Builder.CreateInsertValue(Res, L, i);
 819     }
 820     return Res;
 821   }
 822   case Intrinsic::aarch64_neon_ld2:
 823   case Intrinsic::aarch64_neon_ld3:
 824   case Intrinsic::aarch64_neon_ld4:
 825     if (Inst->getType() == ExpectedType)
 826       return Inst;
 827     return nullptr;
 828   }
 829 }
 830
 831 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
 832                                         MemIntrinsicInfo &Info) {
 833   switch (Inst->getIntrinsicID()) {
 834   default:
 835     break;
 836   case Intrinsic::aarch64_neon_ld2:
 837   case Intrinsic::aarch64_neon_ld3:
 838   case Intrinsic::aarch64_neon_ld4:
 839     Info.ReadMem = true;
 840     Info.WriteMem = false;
 841     Info.PtrVal = Inst->getArgOperand(0);
 842     break;
 843   case Intrinsic::aarch64_neon_st2:
 844   case Intrinsic::aarch64_neon_st3:
 845   case Intrinsic::aarch64_neon_st4:
 846     Info.ReadMem = false;
 847     Info.WriteMem = true;
 848     Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
 849     break;
 850   }
 851
 852   switch (Inst->getIntrinsicID()) {
 853   default:
 854     return false;
 855   case Intrinsic::aarch64_neon_ld2:
 856   case Intrinsic::aarch64_neon_st2:
 857     Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
 858     break;
 859   case Intrinsic::aarch64_neon_ld3:
 860   case Intrinsic::aarch64_neon_st3:
 861     Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
 862     break;
 863   case Intrinsic::aarch64_neon_ld4:
 864   case Intrinsic::aarch64_neon_st4:
 865     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
 866     break;
 867   }
 868   return true;
 869 }
 870
 871 /// See if \p I should be considered for address type promotion. We check if \p
 872 /// I is a sext with right type and used in memory accesses. If it used in a
 873 /// "complex" getelementptr, we allow it to be promoted without finding other
 874 /// sext instructions that sign extended the same initial value. A getelementptr
 875 /// is considered as "complex" if it has more than 2 operands.
 876 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
 877     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
 878   bool Considerable = false;
 879   AllowPromotionWithoutCommonHeader = false;
 880   if (!isa<SExtInst>(&I))
 881     return false;
 882   Type *ConsideredSExtType =
 883       Type::getInt64Ty(I.getParent()->getParent()->getContext());
 884   if (I.getType() != ConsideredSExtType)
 885     return false;
 886   // See if the sext is the one with the right type and used in at least one
 887   // GetElementPtrInst.
 888   for (const User *U : I.users()) {
 889     if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
 890       Considerable = true;
 891       // A getelementptr is considered as "complex" if it has more than 2
 892       // operands. We will promote a SExt used in such complex GEP as we
 893       // expect some computation to be merged if they are done on 64 bits.
 894       if (GEPInst->getNumOperands() > 2) {
 895         AllowPromotionWithoutCommonHeader = true;
 896         break;
 897       }
 898     }
 899   }
 900   return Considerable;
 901 }
 902
 903 bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
 904                                            TTI::ReductionFlags Flags) const {
 905   assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
 906   unsigned ScalarBits = Ty->getScalarSizeInBits();
 907   switch (Opcode) {
 908   case Instruction::FAdd:
 909   case Instruction::FMul:
 910   case Instruction::And:
 911   case Instruction::Or:
 912   case Instruction::Xor:
 913   case Instruction::Mul:
 914     return false;
 915   case Instruction::Add:
 916     return ScalarBits * Ty->getVectorNumElements() >= 128;
 917   case Instruction::ICmp:
 918     return (ScalarBits < 64) &&
 919            (ScalarBits * Ty->getVectorNumElements() >= 128);
 920   case Instruction::FCmp:
 921     return Flags.NoNaN;
 922   default:
 923     llvm_unreachable("Unhandled reduction opcode");
 924   }
 925   return false;
 926 }
 927
 928 int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
 929                                                bool IsPairwiseForm) {
 930
 931   if (IsPairwiseForm)
 932     return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
 933
 934   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 935   MVT MTy = LT.second;
 936   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 937   assert(ISD && "Invalid opcode");
 938
 939   // Horizontal adds can use the 'addv' instruction. We model the cost of these
 940   // instructions as normal vector adds. This is the only arithmetic vector
 941   // reduction operation for which we have an instruction.
 942   static const CostTblEntry CostTblNoPairwise[]{
 943       {ISD::ADD, MVT::v8i8,  1},
 944       {ISD::ADD, MVT::v16i8, 1},
 945       {ISD::ADD, MVT::v4i16, 1},
 946       {ISD::ADD, MVT::v8i16, 1},
 947       {ISD::ADD, MVT::v4i32, 1},
 948   };
 949
 950   if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
 951     return LT.first * Entry->Cost;
 952
 953   return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
 954 }
 955
 956 int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 957                                    Type *SubTp) {
 958   if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
 959       Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
 960     static const CostTblEntry ShuffleTbl[] = {
 961       // Broadcast shuffle kinds can be performed with 'dup'.
 962       { TTI::SK_Broadcast, MVT::v8i8,  1 },
 963       { TTI::SK_Broadcast, MVT::v16i8, 1 },
 964       { TTI::SK_Broadcast, MVT::v4i16, 1 },
 965       { TTI::SK_Broadcast, MVT::v8i16, 1 },
 966       { TTI::SK_Broadcast, MVT::v2i32, 1 },
 967       { TTI::SK_Broadcast, MVT::v4i32, 1 },
 968       { TTI::SK_Broadcast, MVT::v2i64, 1 },
 969       { TTI::SK_Broadcast, MVT::v2f32, 1 },
 970       { TTI::SK_Broadcast, MVT::v4f32, 1 },
 971       { TTI::SK_Broadcast, MVT::v2f64, 1 },
 972       // Transpose shuffle kinds can be performed with 'trn1/trn2' and
 973       // 'zip1/zip2' instructions.
 974       { TTI::SK_Transpose, MVT::v8i8,  1 },
 975       { TTI::SK_Transpose, MVT::v16i8, 1 },
 976       { TTI::SK_Transpose, MVT::v4i16, 1 },
 977       { TTI::SK_Transpose, MVT::v8i16, 1 },
 978       { TTI::SK_Transpose, MVT::v2i32, 1 },
 979       { TTI::SK_Transpose, MVT::v4i32, 1 },
 980       { TTI::SK_Transpose, MVT::v2i64, 1 },
 981       { TTI::SK_Transpose, MVT::v2f32, 1 },
 982       { TTI::SK_Transpose, MVT::v4f32, 1 },
 983       { TTI::SK_Transpose, MVT::v2f64, 1 },
 984       // Select shuffle kinds.
 985       // TODO: handle vXi8/vXi16.
 986       { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
 987       { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
 988       { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
 989       { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
 990       { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
 991       { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
 992       // PermuteSingleSrc shuffle kinds.
 993       // TODO: handle vXi8/vXi16.
 994       { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
 995       { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
 996       { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
 997       { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
 998       { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
 999       { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
1000     };
1001     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
1002     if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
1003       return LT.first * Entry->Cost;
1004   }
1005
1006   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
1007 }