lib/Target/ARM/ARMTargetTransformInfo.cpp

   1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "ARMTargetTransformInfo.h"
  10 #include "ARMSubtarget.h"
  11 #include "MCTargetDesc/ARMAddressingModes.h"
  12 #include "llvm/ADT/APInt.h"
  13 #include "llvm/ADT/SmallVector.h"
  14 #include "llvm/Analysis/LoopInfo.h"
  15 #include "llvm/CodeGen/CostTable.h"
  16 #include "llvm/CodeGen/ISDOpcodes.h"
  17 #include "llvm/CodeGen/ValueTypes.h"
  18 #include "llvm/IR/BasicBlock.h"
  19 #include "llvm/IR/CallSite.h"
  20 #include "llvm/IR/DataLayout.h"
  21 #include "llvm/IR/DerivedTypes.h"
  22 #include "llvm/IR/Instruction.h"
  23 #include "llvm/IR/Instructions.h"
  24 #include "llvm/IR/IntrinsicInst.h"
  25 #include "llvm/IR/Type.h"
  26 #include "llvm/MC/SubtargetFeature.h"
  27 #include "llvm/Support/Casting.h"
  28 #include "llvm/Support/MachineValueType.h"
  29 #include "llvm/Target/TargetMachine.h"
  30 #include <algorithm>
  31 #include <cassert>
  32 #include <cstdint>
  33 #include <utility>
  34
  35 using namespace llvm;
  36
  37 #define DEBUG_TYPE "armtti"
  38
  39 static cl::opt<bool> EnableMaskedLoadStores(
  40   "enable-arm-maskedldst", cl::Hidden, cl::init(false),
  41   cl::desc("Enable the generation of masked loads and stores"));
  42
  43 static cl::opt<bool> DisableLowOverheadLoops(
  44   "disable-arm-loloops", cl::Hidden, cl::init(false),
  45   cl::desc("Disable the generation of low-overhead loops"));
  46
  47 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
  48                                      const Function *Callee) const {
  49   const TargetMachine &TM = getTLI()->getTargetMachine();
  50   const FeatureBitset &CallerBits =
  51       TM.getSubtargetImpl(*Caller)->getFeatureBits();
  52   const FeatureBitset &CalleeBits =
  53       TM.getSubtargetImpl(*Callee)->getFeatureBits();
  54
  55   // To inline a callee, all features not in the whitelist must match exactly.
  56   bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
  57                     (CalleeBits & ~InlineFeatureWhitelist);
  58   // For features in the whitelist, the callee's features must be a subset of
  59   // the callers'.
  60   bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
  61                      (CalleeBits & InlineFeatureWhitelist);
  62   return MatchExact && MatchSubset;
  63 }
  64
  65 int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  66   assert(Ty->isIntegerTy());
  67
  68  unsigned Bits = Ty->getPrimitiveSizeInBits();
  69  if (Bits == 0 || Imm.getActiveBits() >= 64)
  70    return 4;
  71
  72   int64_t SImmVal = Imm.getSExtValue();
  73   uint64_t ZImmVal = Imm.getZExtValue();
  74   if (!ST->isThumb()) {
  75     if ((SImmVal >= 0 && SImmVal < 65536) ||
  76         (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
  77         (ARM_AM::getSOImmVal(~ZImmVal) != -1))
  78       return 1;
  79     return ST->hasV6T2Ops() ? 2 : 3;
  80   }
  81   if (ST->isThumb2()) {
  82     if ((SImmVal >= 0 && SImmVal < 65536) ||
  83         (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
  84         (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
  85       return 1;
  86     return ST->hasV6T2Ops() ? 2 : 3;
  87   }
  88   // Thumb1, any i8 imm cost 1.
  89   if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
  90     return 1;
  91   if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
  92     return 2;
  93   // Load from constantpool.
  94   return 3;
  95 }
  96
  97 // Constants smaller than 256 fit in the immediate field of
  98 // Thumb1 instructions so we return a zero cost and 1 otherwise.
  99 int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
 100                                       const APInt &Imm, Type *Ty) {
 101   if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
 102     return 0;
 103
 104   return 1;
 105 }
 106
 107 int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
 108                               Type *Ty) {
 109   // Division by a constant can be turned into multiplication, but only if we
 110   // know it's constant. So it's not so much that the immediate is cheap (it's
 111   // not), but that the alternative is worse.
 112   // FIXME: this is probably unneeded with GlobalISel.
 113   if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
 114        Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
 115       Idx == 1)
 116     return 0;
 117
 118   if (Opcode == Instruction::And) {
 119     // UXTB/UXTH
 120     if (Imm == 255 || Imm == 65535)
 121       return 0;
 122     // Conversion to BIC is free, and means we can use ~Imm instead.
 123     return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
 124   }
 125
 126   if (Opcode == Instruction::Add)
 127     // Conversion to SUB is free, and means we can use -Imm instead.
 128     return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
 129
 130   if (Opcode == Instruction::ICmp && Imm.isNegative() &&
 131       Ty->getIntegerBitWidth() == 32) {
 132     int64_t NegImm = -Imm.getSExtValue();
 133     if (ST->isThumb2() && NegImm < 1<<12)
 134       // icmp X, #-C -> cmn X, #C
 135       return 0;
 136     if (ST->isThumb() && NegImm < 1<<8)
 137       // icmp X, #-C -> adds X, #C
 138       return 0;
 139   }
 140
 141   // xor a, -1 can always be folded to MVN
 142   if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
 143     return 0;
 144
 145   return getIntImmCost(Imm, Ty);
 146 }
 147
 148 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 149                                  const Instruction *I) {
 150   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 151   assert(ISD && "Invalid opcode");
 152
 153   // Single to/from double precision conversions.
 154   static const CostTblEntry NEONFltDblTbl[] = {
 155     // Vector fptrunc/fpext conversions.
 156     { ISD::FP_ROUND,   MVT::v2f64, 2 },
 157     { ISD::FP_EXTEND,  MVT::v2f32, 2 },
 158     { ISD::FP_EXTEND,  MVT::v4f32, 4 }
 159   };
 160
 161   if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
 162                                           ISD == ISD::FP_EXTEND)) {
 163     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
 164     if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
 165       return LT.first * Entry->Cost;
 166   }
 167
 168   EVT SrcTy = TLI->getValueType(DL, Src);
 169   EVT DstTy = TLI->getValueType(DL, Dst);
 170
 171   if (!SrcTy.isSimple() || !DstTy.isSimple())
 172     return BaseT::getCastInstrCost(Opcode, Dst, Src);
 173
 174   // The extend of a load is free
 175   if (I && isa<LoadInst>(I->getOperand(0))) {
 176     static const TypeConversionCostTblEntry LoadConversionTbl[] = {
 177         {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
 178         {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
 179         {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
 180         {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
 181         {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
 182         {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
 183         {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
 184         {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
 185         {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
 186         {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
 187         {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
 188         {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
 189     };
 190     if (const auto *Entry = ConvertCostTableLookup(
 191             LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
 192       return Entry->Cost;
 193
 194     static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
 195         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
 196         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
 197         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
 198         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
 199         {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
 200         {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
 201     };
 202     if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
 203       if (const auto *Entry =
 204               ConvertCostTableLookup(MVELoadConversionTbl, ISD,
 205                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
 206         return Entry->Cost;
 207     }
 208   }
 209
 210   // Some arithmetic, load and store operations have specific instructions
 211   // to cast up/down their types automatically at no extra cost.
 212   // TODO: Get these tables to know at least what the related operations are.
 213   static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
 214     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
 215     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
 216     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
 217     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
 218     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
 219     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
 220
 221     // The number of vmovl instructions for the extension.
 222     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
 223     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
 224     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
 225     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
 226     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
 227     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
 228     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
 229     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
 230     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 231     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 232
 233     // Operations that we legalize using splitting.
 234     { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
 235     { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
 236
 237     // Vector float <-> i32 conversions.
 238     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
 239     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
 240
 241     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
 242     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
 243     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
 244     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
 245     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
 246     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
 247     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
 248     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
 249     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
 250     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
 251     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
 252     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
 253     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
 254     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
 255     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
 256     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
 257     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
 258     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
 259     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
 260     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
 261
 262     { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
 263     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
 264     { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
 265     { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
 266     { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
 267     { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
 268
 269     // Vector double <-> i32 conversions.
 270     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 271     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 272
 273     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
 274     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
 275     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
 276     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
 277     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 278     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 279
 280     { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
 281     { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
 282     { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
 283     { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
 284     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
 285     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
 286   };
 287
 288   if (SrcTy.isVector() && ST->hasNEON()) {
 289     if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
 290                                                    DstTy.getSimpleVT(),
 291                                                    SrcTy.getSimpleVT()))
 292       return Entry->Cost;
 293   }
 294
 295   // Scalar float to integer conversions.
 296   static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
 297     { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
 298     { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
 299     { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
 300     { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
 301     { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
 302     { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
 303     { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
 304     { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
 305     { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
 306     { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
 307     { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
 308     { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
 309     { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
 310     { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
 311     { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
 312     { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
 313     { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
 314     { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
 315     { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
 316     { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
 317   };
 318   if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
 319     if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
 320                                                    DstTy.getSimpleVT(),
 321                                                    SrcTy.getSimpleVT()))
 322       return Entry->Cost;
 323   }
 324
 325   // Scalar integer to float conversions.
 326   static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
 327     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
 328     { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
 329     { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
 330     { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
 331     { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
 332     { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
 333     { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
 334     { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
 335     { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
 336     { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
 337     { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
 338     { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
 339     { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
 340     { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
 341     { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
 342     { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
 343     { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
 344     { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
 345     { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
 346     { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
 347   };
 348
 349   if (SrcTy.isInteger() && ST->hasNEON()) {
 350     if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
 351                                                    ISD, DstTy.getSimpleVT(),
 352                                                    SrcTy.getSimpleVT()))
 353       return Entry->Cost;
 354   }
 355
 356   // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
 357   // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
 358   // are linearised so take more.
 359   static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
 360     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 361     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 362     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 363     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 364     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
 365     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
 366     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 367     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 368     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
 369     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
 370     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
 371     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
 372   };
 373
 374   if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
 375     if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
 376                                                    ISD, DstTy.getSimpleVT(),
 377                                                    SrcTy.getSimpleVT()))
 378       return Entry->Cost * ST->getMVEVectorCostFactor();
 379   }
 380
 381   // Scalar integer conversion costs.
 382   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
 383     // i16 -> i64 requires two dependent operations.
 384     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
 385
 386     // Truncates on i64 are assumed to be free.
 387     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
 388     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
 389     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
 390     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
 391   };
 392
 393   if (SrcTy.isInteger()) {
 394     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
 395                                                    DstTy.getSimpleVT(),
 396                                                    SrcTy.getSimpleVT()))
 397       return Entry->Cost;
 398   }
 399
 400   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
 401                      ? ST->getMVEVectorCostFactor()
 402                      : 1;
 403   return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
 404 }
 405
 406 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
 407                                    unsigned Index) {
 408   // Penalize inserting into an D-subregister. We end up with a three times
 409   // lower estimated throughput on swift.
 410   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
 411       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
 412     return 3;
 413
 414   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
 415                         Opcode == Instruction::ExtractElement)) {
 416     // Cross-class copies are expensive on many microarchitectures,
 417     // so assume they are expensive by default.
 418     if (ValTy->getVectorElementType()->isIntegerTy())
 419       return 3;
 420
 421     // Even if it's not a cross class copy, this likely leads to mixing
 422     // of NEON and VFP code and should be therefore penalized.
 423     if (ValTy->isVectorTy() &&
 424         ValTy->getScalarSizeInBits() <= 32)
 425       return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
 426   }
 427
 428   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
 429                                  Opcode == Instruction::ExtractElement)) {
 430     // We say MVE moves costs at least the MVEVectorCostFactor, even though
 431     // they are scalar instructions. This helps prevent mixing scalar and
 432     // vector, to prevent vectorising where we end up just scalarising the
 433     // result anyway.
 434     return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
 435                     ST->getMVEVectorCostFactor()) *
 436            ValTy->getVectorNumElements() / 2;
 437   }
 438
 439   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 440 }
 441
 442 int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
 443                                    const Instruction *I) {
 444   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 445   // On NEON a vector select gets lowered to vbsl.
 446   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
 447     // Lowering of some vector selects is currently far from perfect.
 448     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
 449       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
 450       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
 451       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
 452     };
 453
 454     EVT SelCondTy = TLI->getValueType(DL, CondTy);
 455     EVT SelValTy = TLI->getValueType(DL, ValTy);
 456     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
 457       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
 458                                                      SelCondTy.getSimpleVT(),
 459                                                      SelValTy.getSimpleVT()))
 460         return Entry->Cost;
 461     }
 462
 463     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 464     return LT.first;
 465   }
 466
 467   int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
 468                      ? ST->getMVEVectorCostFactor()
 469                      : 1;
 470   return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 471 }
 472
 473 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 474                                           const SCEV *Ptr) {
 475   // Address computations in vectorized code with non-consecutive addresses will
 476   // likely result in more instructions compared to scalar code where the
 477   // computation can more often be merged into the index mode. The resulting
 478   // extra micro-ops can significantly decrease throughput.
 479   unsigned NumVectorInstToHideOverhead = 10;
 480   int MaxMergeDistance = 64;
 481
 482   if (ST->hasNEON()) {
 483     if (Ty->isVectorTy() && SE &&
 484         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
 485       return NumVectorInstToHideOverhead;
 486
 487     // In many cases the address computation is not merged into the instruction
 488     // addressing mode.
 489     return 1;
 490   }
 491   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 492 }
 493
 494 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
 495   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
 496     return false;
 497
 498   if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
 499     // Don't support v2i1 yet.
 500     if (VecTy->getNumElements() == 2)
 501       return false;
 502
 503     // We don't support extending fp types.
 504      unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
 505     if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
 506       return false;
 507   }
 508
 509   unsigned EltWidth = DataTy->getScalarSizeInBits();
 510   return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
 511          (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
 512          (EltWidth == 8);
 513 }
 514
 515 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
 516   const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
 517   assert(MI && "MemcpyInst expected");
 518   ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
 519
 520   // To model the cost of a library call, we assume 1 for the call, and
 521   // 3 for the argument setup.
 522   const unsigned LibCallCost = 4;
 523
 524   // If 'size' is not a constant, a library call will be generated.
 525   if (!C)
 526     return LibCallCost;
 527
 528   const unsigned Size = C->getValue().getZExtValue();
 529   const unsigned DstAlign = MI->getDestAlignment();
 530   const unsigned SrcAlign = MI->getSourceAlignment();
 531   const Function *F = I->getParent()->getParent();
 532   const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
 533   std::vector<EVT> MemOps;
 534
 535   // MemOps will be poplulated with a list of data types that needs to be
 536   // loaded and stored. That's why we multiply the number of elements by 2 to
 537   // get the cost for this memcpy.
 538   if (getTLI()->findOptimalMemOpLowering(
 539           MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
 540           false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
 541           MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
 542           F->getAttributes()))
 543     return MemOps.size() * 2;
 544
 545   // If we can't find an optimal memop lowering, return the default cost
 546   return LibCallCost;
 547 }
 548
 549 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 550                                Type *SubTp) {
 551   if (ST->hasNEON()) {
 552     if (Kind == TTI::SK_Broadcast) {
 553       static const CostTblEntry NEONDupTbl[] = {
 554           // VDUP handles these cases.
 555           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
 556           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
 557           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
 558           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
 559           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
 560           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
 561
 562           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
 563           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
 564           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
 565           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
 566
 567       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 568
 569       if (const auto *Entry =
 570               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
 571         return LT.first * Entry->Cost;
 572     }
 573     if (Kind == TTI::SK_Reverse) {
 574       static const CostTblEntry NEONShuffleTbl[] = {
 575           // Reverse shuffle cost one instruction if we are shuffling within a
 576           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
 577           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
 578           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
 579           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
 580           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
 581           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
 582           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
 583
 584           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
 585           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
 586           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
 587           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
 588
 589       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 590
 591       if (const auto *Entry =
 592               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
 593         return LT.first * Entry->Cost;
 594     }
 595     if (Kind == TTI::SK_Select) {
 596       static const CostTblEntry NEONSelShuffleTbl[] = {
 597           // Select shuffle cost table for ARM. Cost is the number of
 598           // instructions
 599           // required to create the shuffled vector.
 600
 601           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
 602           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
 603           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
 604           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
 605
 606           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
 607           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
 608           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
 609
 610           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
 611
 612           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
 613
 614       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 615       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
 616                                               ISD::VECTOR_SHUFFLE, LT.second))
 617         return LT.first * Entry->Cost;
 618     }
 619   }
 620   if (ST->hasMVEIntegerOps()) {
 621     if (Kind == TTI::SK_Broadcast) {
 622       static const CostTblEntry MVEDupTbl[] = {
 623           // VDUP handles these cases.
 624           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
 625           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
 626           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
 627           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
 628           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
 629
 630       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 631
 632       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
 633                                               LT.second))
 634         return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
 635     }
 636   }
 637   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
 638                      ? ST->getMVEVectorCostFactor()
 639                      : 1;
 640   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 641 }
 642
 643 int ARMTTIImpl::getArithmeticInstrCost(
 644     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
 645     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
 646     TTI::OperandValueProperties Opd2PropInfo,
 647     ArrayRef<const Value *> Args) {
 648   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
 649   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 650
 651   const unsigned FunctionCallDivCost = 20;
 652   const unsigned ReciprocalDivCost = 10;
 653   static const CostTblEntry CostTbl[] = {
 654     // Division.
 655     // These costs are somewhat random. Choose a cost of 20 to indicate that
 656     // vectorizing devision (added function call) is going to be very expensive.
 657     // Double registers types.
 658     { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
 659     { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
 660     { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
 661     { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
 662     { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
 663     { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
 664     { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
 665     { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
 666     { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
 667     { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
 668     { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
 669     { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
 670     { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
 671     { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
 672     { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
 673     { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
 674     // Quad register types.
 675     { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
 676     { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
 677     { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
 678     { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
 679     { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
 680     { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
 681     { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
 682     { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
 683     { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
 684     { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
 685     { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
 686     { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
 687     { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
 688     { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
 689     { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
 690     { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
 691     // Multiplication.
 692   };
 693
 694   if (ST->hasNEON()) {
 695     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
 696       return LT.first * Entry->Cost;
 697
 698     int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
 699                                              Opd1PropInfo, Opd2PropInfo);
 700
 701     // This is somewhat of a hack. The problem that we are facing is that SROA
 702     // creates a sequence of shift, and, or instructions to construct values.
 703     // These sequences are recognized by the ISel and have zero-cost. Not so for
 704     // the vectorized code. Because we have support for v2i64 but not i64 those
 705     // sequences look particularly beneficial to vectorize.
 706     // To work around this we increase the cost of v2i64 operations to make them
 707     // seem less beneficial.
 708     if (LT.second == MVT::v2i64 &&
 709         Op2Info == TargetTransformInfo::OK_UniformConstantValue)
 710       Cost += 4;
 711
 712     return Cost;
 713   }
 714
 715   int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
 716                      ? ST->getMVEVectorCostFactor()
 717                      : 1;
 718
 719   // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
 720   // without treating floats as more expensive than scalars or increasing the
 721   // costs for custom operations. The result is also multiplied by the
 722   // MVEVectorCostFactor where appropriate.
 723   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
 724     return LT.first * BaseCost;
 725
 726   // Else this is expand, assume that we need to scalarize this op.
 727   if (Ty->isVectorTy()) {
 728     unsigned Num = Ty->getVectorNumElements();
 729     unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
 730     // Return the cost of multiple scalar invocation plus the cost of
 731     // inserting and extracting the values.
 732     return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
 733   }
 734
 735   return BaseCost;
 736 }
 737
 738 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 739                                 unsigned AddressSpace, const Instruction *I) {
 740   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
 741
 742   if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
 743       Src->getVectorElementType()->isDoubleTy()) {
 744     // Unaligned loads/stores are extremely inefficient.
 745     // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
 746     return LT.first * 4;
 747   }
 748   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
 749                      ? ST->getMVEVectorCostFactor()
 750                      : 1;
 751   return BaseCost * LT.first;
 752 }
 753
 754 int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
 755                                            unsigned Factor,
 756                                            ArrayRef<unsigned> Indices,
 757                                            unsigned Alignment,
 758                                            unsigned AddressSpace,
 759                                            bool UseMaskForCond,
 760                                            bool UseMaskForGaps) {
 761   assert(Factor >= 2 && "Invalid interleave factor");
 762   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 763
 764   // vldN/vstN doesn't support vector types of i64/f64 element.
 765   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 766
 767   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
 768       !UseMaskForCond && !UseMaskForGaps) {
 769     unsigned NumElts = VecTy->getVectorNumElements();
 770     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 771
 772     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
 773     // Accesses having vector types that are a multiple of 128 bits can be
 774     // matched to more than one vldN/vstN instruction.
 775     if (NumElts % Factor == 0 &&
 776         TLI->isLegalInterleavedAccessType(SubVecTy, DL))
 777       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
 778   }
 779
 780   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
 781                                            Alignment, AddressSpace,
 782                                            UseMaskForCond, UseMaskForGaps);
 783 }
 784
 785 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
 786   if (!F->isIntrinsic())
 787     BaseT::isLoweredToCall(F);
 788
 789   // Assume all Arm-specific intrinsics map to an instruction.
 790   if (F->getName().startswith("llvm.arm"))
 791     return false;
 792
 793   switch (F->getIntrinsicID()) {
 794   default: break;
 795   case Intrinsic::powi:
 796   case Intrinsic::sin:
 797   case Intrinsic::cos:
 798   case Intrinsic::pow:
 799   case Intrinsic::log:
 800   case Intrinsic::log10:
 801   case Intrinsic::log2:
 802   case Intrinsic::exp:
 803   case Intrinsic::exp2:
 804     return true;
 805   case Intrinsic::sqrt:
 806   case Intrinsic::fabs:
 807   case Intrinsic::copysign:
 808   case Intrinsic::floor:
 809   case Intrinsic::ceil:
 810   case Intrinsic::trunc:
 811   case Intrinsic::rint:
 812   case Intrinsic::nearbyint:
 813   case Intrinsic::round:
 814   case Intrinsic::canonicalize:
 815   case Intrinsic::lround:
 816   case Intrinsic::llround:
 817   case Intrinsic::lrint:
 818   case Intrinsic::llrint:
 819     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
 820       return true;
 821     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
 822       return true;
 823     // Some operations can be handled by vector instructions and assume
 824     // unsupported vectors will be expanded into supported scalar ones.
 825     // TODO Handle scalar operations properly.
 826     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
 827   case Intrinsic::masked_store:
 828   case Intrinsic::masked_load:
 829   case Intrinsic::masked_gather:
 830   case Intrinsic::masked_scatter:
 831     return !ST->hasMVEIntegerOps();
 832   case Intrinsic::sadd_with_overflow:
 833   case Intrinsic::uadd_with_overflow:
 834   case Intrinsic::ssub_with_overflow:
 835   case Intrinsic::usub_with_overflow:
 836   case Intrinsic::sadd_sat:
 837   case Intrinsic::uadd_sat:
 838   case Intrinsic::ssub_sat:
 839   case Intrinsic::usub_sat:
 840     return false;
 841   }
 842
 843   return BaseT::isLoweredToCall(F);
 844 }
 845
 846 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
 847                                           AssumptionCache &AC,
 848                                           TargetLibraryInfo *LibInfo,
 849                                           HardwareLoopInfo &HWLoopInfo) {
 850   // Low-overhead branches are only supported in the 'low-overhead branch'
 851   // extension of v8.1-m.
 852   if (!ST->hasLOB() || DisableLowOverheadLoops)
 853     return false;
 854
 855   if (!SE.hasLoopInvariantBackedgeTakenCount(L))
 856     return false;
 857
 858   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
 859   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
 860     return false;
 861
 862   const SCEV *TripCountSCEV =
 863     SE.getAddExpr(BackedgeTakenCount,
 864                   SE.getOne(BackedgeTakenCount->getType()));
 865
 866   // We need to store the trip count in LR, a 32-bit register.
 867   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
 868     return false;
 869
 870   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
 871   // point in generating a hardware loop if that's going to happen.
 872   auto MaybeCall = [this](Instruction &I) {
 873     const ARMTargetLowering *TLI = getTLI();
 874     unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
 875     EVT VT = TLI->getValueType(DL, I.getType(), true);
 876     if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
 877       return true;
 878
 879     // Check if an intrinsic will be lowered to a call and assume that any
 880     // other CallInst will generate a bl.
 881     if (auto *Call = dyn_cast<CallInst>(&I)) {
 882       if (isa<IntrinsicInst>(Call)) {
 883         if (const Function *F = Call->getCalledFunction())
 884           return isLoweredToCall(F);
 885       }
 886       return true;
 887     }
 888
 889     // FPv5 provides conversions between integer, double-precision,
 890     // single-precision, and half-precision formats.
 891     switch (I.getOpcode()) {
 892     default:
 893       break;
 894     case Instruction::FPToSI:
 895     case Instruction::FPToUI:
 896     case Instruction::SIToFP:
 897     case Instruction::UIToFP:
 898     case Instruction::FPTrunc:
 899     case Instruction::FPExt:
 900       return !ST->hasFPARMv8Base();
 901     }
 902
 903     // FIXME: Unfortunately the approach of checking the Operation Action does
 904     // not catch all cases of Legalization that use library calls. Our
 905     // Legalization step categorizes some transformations into library calls as
 906     // Custom, Expand or even Legal when doing type legalization. So for now
 907     // we have to special case for instance the SDIV of 64bit integers and the
 908     // use of floating point emulation.
 909     if (VT.isInteger() && VT.getSizeInBits() >= 64) {
 910       switch (ISD) {
 911       default:
 912         break;
 913       case ISD::SDIV:
 914       case ISD::UDIV:
 915       case ISD::SREM:
 916       case ISD::UREM:
 917       case ISD::SDIVREM:
 918       case ISD::UDIVREM:
 919         return true;
 920       }
 921     }
 922
 923     // Assume all other non-float operations are supported.
 924     if (!VT.isFloatingPoint())
 925       return false;
 926
 927     // We'll need a library call to handle most floats when using soft.
 928     if (TLI->useSoftFloat()) {
 929       switch (I.getOpcode()) {
 930       default:
 931         return true;
 932       case Instruction::Alloca:
 933       case Instruction::Load:
 934       case Instruction::Store:
 935       case Instruction::Select:
 936       case Instruction::PHI:
 937         return false;
 938       }
 939     }
 940
 941     // We'll need a libcall to perform double precision operations on a single
 942     // precision only FPU.
 943     if (I.getType()->isDoubleTy() && !ST->hasFP64())
 944       return true;
 945
 946     // Likewise for half precision arithmetic.
 947     if (I.getType()->isHalfTy() && !ST->hasFullFP16())
 948       return true;
 949
 950     return false;
 951   };
 952
 953   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
 954     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
 955       switch (Call->getIntrinsicID()) {
 956       default:
 957         break;
 958       case Intrinsic::set_loop_iterations:
 959       case Intrinsic::test_set_loop_iterations:
 960       case Intrinsic::loop_decrement:
 961       case Intrinsic::loop_decrement_reg:
 962         return true;
 963       }
 964     }
 965     return false;
 966   };
 967
 968   // Scan the instructions to see if there's any that we know will turn into a
 969   // call or if this loop is already a low-overhead loop.
 970   auto ScanLoop = [&](Loop *L) {
 971     for (auto *BB : L->getBlocks()) {
 972       for (auto &I : *BB) {
 973         if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
 974           return false;
 975       }
 976     }
 977     return true;
 978   };
 979
 980   // Visit inner loops.
 981   for (auto Inner : *L)
 982     if (!ScanLoop(Inner))
 983       return false;
 984
 985   if (!ScanLoop(L))
 986     return false;
 987
 988   // TODO: Check whether the trip count calculation is expensive. If L is the
 989   // inner loop but we know it has a low trip count, calculating that trip
 990   // count (in the parent loop) may be detrimental.
 991
 992   LLVMContext &C = L->getHeader()->getContext();
 993   HWLoopInfo.CounterInReg = true;
 994   HWLoopInfo.IsNestingLegal = false;
 995   HWLoopInfo.PerformEntryTest = true;
 996   HWLoopInfo.CountType = Type::getInt32Ty(C);
 997   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
 998   return true;
 999 }
1000
1001 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1002                                          TTI::UnrollingPreferences &UP) {
1003   // Only currently enable these preferences for M-Class cores.
1004   if (!ST->isMClass())
1005     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
1006
1007   // Disable loop unrolling for Oz and Os.
1008   UP.OptSizeThreshold = 0;
1009   UP.PartialOptSizeThreshold = 0;
1010   if (L->getHeader()->getParent()->hasOptSize())
1011     return;
1012
1013   // Only enable on Thumb-2 targets.
1014   if (!ST->isThumb2())
1015     return;
1016
1017   SmallVector<BasicBlock*, 4> ExitingBlocks;
1018   L->getExitingBlocks(ExitingBlocks);
1019   LLVM_DEBUG(dbgs() << "Loop has:\n"
1020                     << "Blocks: " << L->getNumBlocks() << "\n"
1021                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
1022
1023   // Only allow another exit other than the latch. This acts as an early exit
1024   // as it mirrors the profitability calculation of the runtime unroller.
1025   if (ExitingBlocks.size() > 2)
1026     return;
1027
1028   // Limit the CFG of the loop body for targets with a branch predictor.
1029   // Allowing 4 blocks permits if-then-else diamonds in the body.
1030   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
1031     return;
1032
1033   // Scan the loop: don't unroll loops with calls as this could prevent
1034   // inlining.
1035   unsigned Cost = 0;
1036   for (auto *BB : L->getBlocks()) {
1037     for (auto &I : *BB) {
1038       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1039         ImmutableCallSite CS(&I);
1040         if (const Function *F = CS.getCalledFunction()) {
1041           if (!isLoweredToCall(F))
1042             continue;
1043         }
1044         return;
1045       }
1046       // Don't unroll vectorised loop. MVE does not benefit from it as much as
1047       // scalar code.
1048       if (I.getType()->isVectorTy())
1049         return;
1050
1051       SmallVector<const Value*, 4> Operands(I.value_op_begin(),
1052                                             I.value_op_end());
1053       Cost += getUserCost(&I, Operands);
1054     }
1055   }
1056
1057   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1058
1059   UP.Partial = true;
1060   UP.Runtime = true;
1061   UP.UpperBound = true;
1062   UP.UnrollRemainder = true;
1063   UP.DefaultUnrollRuntimeCount = 4;
1064   UP.UnrollAndJam = true;
1065   UP.UnrollAndJamInnerLoopThreshold = 60;
1066
1067   // Force unrolling small loops can be very useful because of the branch
1068   // taken cost of the backedge.
1069   if (Cost < 12)
1070     UP.Force = true;
1071 }
1072
1073 bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
1074                                        TTI::ReductionFlags Flags) const {
1075   assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
1076   unsigned ScalarBits = Ty->getScalarSizeInBits();
1077   if (!ST->hasMVEIntegerOps())
1078     return false;
1079
1080   switch (Opcode) {
1081   case Instruction::FAdd:
1082   case Instruction::FMul:
1083   case Instruction::And:
1084   case Instruction::Or:
1085   case Instruction::Xor:
1086   case Instruction::Mul:
1087   case Instruction::FCmp:
1088     return false;
1089   case Instruction::ICmp:
1090   case Instruction::Add:
1091     return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
1092   default:
1093     llvm_unreachable("Unhandled reduction opcode");
1094   }
1095   return false;
1096 }