lib/Target/ARM/ARMTargetTransformInfo.cpp

   1 //===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "ARMTargetTransformInfo.h"
  10 #include "ARMSubtarget.h"
  11 #include "MCTargetDesc/ARMAddressingModes.h"
  12 #include "llvm/ADT/APInt.h"
  13 #include "llvm/ADT/SmallVector.h"
  14 #include "llvm/Analysis/LoopInfo.h"
  15 #include "llvm/CodeGen/CostTable.h"
  16 #include "llvm/CodeGen/ISDOpcodes.h"
  17 #include "llvm/CodeGen/ValueTypes.h"
  18 #include "llvm/IR/BasicBlock.h"
  19 #include "llvm/IR/CallSite.h"
  20 #include "llvm/IR/DataLayout.h"
  21 #include "llvm/IR/DerivedTypes.h"
  22 #include "llvm/IR/Instruction.h"
  23 #include "llvm/IR/Instructions.h"
  24 #include "llvm/IR/IntrinsicInst.h"
  25 #include "llvm/IR/Type.h"
  26 #include "llvm/MC/SubtargetFeature.h"
  27 #include "llvm/Support/Casting.h"
  28 #include "llvm/Support/MachineValueType.h"
  29 #include "llvm/Target/TargetMachine.h"
  30 #include <algorithm>
  31 #include <cassert>
  32 #include <cstdint>
  33 #include <utility>
  34
  35 using namespace llvm;
  36
  37 #define DEBUG_TYPE "armtti"
  38
  39 static cl::opt<bool> EnableMaskedLoadStores(
  40   "enable-arm-maskedldst", cl::Hidden, cl::init(false),
  41   cl::desc("Enable the generation of masked loads and stores"));
  42
  43 static cl::opt<bool> DisableLowOverheadLoops(
  44   "disable-arm-loloops", cl::Hidden, cl::init(false),
  45   cl::desc("Disable the generation of low-overhead loops"));
  46
  47 bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
  48                                      const Function *Callee) const {
  49   const TargetMachine &TM = getTLI()->getTargetMachine();
  50   const FeatureBitset &CallerBits =
  51       TM.getSubtargetImpl(*Caller)->getFeatureBits();
  52   const FeatureBitset &CalleeBits =
  53       TM.getSubtargetImpl(*Callee)->getFeatureBits();
  54
  55   // To inline a callee, all features not in the whitelist must match exactly.
  56   bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
  57                     (CalleeBits & ~InlineFeatureWhitelist);
  58   // For features in the whitelist, the callee's features must be a subset of
  59   // the callers'.
  60   bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
  61                      (CalleeBits & InlineFeatureWhitelist);
  62   return MatchExact && MatchSubset;
  63 }
  64
  65 int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
  66   assert(Ty->isIntegerTy());
  67
  68  unsigned Bits = Ty->getPrimitiveSizeInBits();
  69  if (Bits == 0 || Imm.getActiveBits() >= 64)
  70    return 4;
  71
  72   int64_t SImmVal = Imm.getSExtValue();
  73   uint64_t ZImmVal = Imm.getZExtValue();
  74   if (!ST->isThumb()) {
  75     if ((SImmVal >= 0 && SImmVal < 65536) ||
  76         (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
  77         (ARM_AM::getSOImmVal(~ZImmVal) != -1))
  78       return 1;
  79     return ST->hasV6T2Ops() ? 2 : 3;
  80   }
  81   if (ST->isThumb2()) {
  82     if ((SImmVal >= 0 && SImmVal < 65536) ||
  83         (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
  84         (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
  85       return 1;
  86     return ST->hasV6T2Ops() ? 2 : 3;
  87   }
  88   // Thumb1, any i8 imm cost 1.
  89   if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
  90     return 1;
  91   if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
  92     return 2;
  93   // Load from constantpool.
  94   return 3;
  95 }
  96
  97 // Constants smaller than 256 fit in the immediate field of
  98 // Thumb1 instructions so we return a zero cost and 1 otherwise.
  99 int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
 100                                       const APInt &Imm, Type *Ty) {
 101   if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
 102     return 0;
 103
 104   return 1;
 105 }
 106
 107 int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
 108                               Type *Ty) {
 109   // Division by a constant can be turned into multiplication, but only if we
 110   // know it's constant. So it's not so much that the immediate is cheap (it's
 111   // not), but that the alternative is worse.
 112   // FIXME: this is probably unneeded with GlobalISel.
 113   if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
 114        Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
 115       Idx == 1)
 116     return 0;
 117
 118   if (Opcode == Instruction::And) {
 119     // UXTB/UXTH
 120     if (Imm == 255 || Imm == 65535)
 121       return 0;
 122     // Conversion to BIC is free, and means we can use ~Imm instead.
 123     return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
 124   }
 125
 126   if (Opcode == Instruction::Add)
 127     // Conversion to SUB is free, and means we can use -Imm instead.
 128     return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
 129
 130   if (Opcode == Instruction::ICmp && Imm.isNegative() &&
 131       Ty->getIntegerBitWidth() == 32) {
 132     int64_t NegImm = -Imm.getSExtValue();
 133     if (ST->isThumb2() && NegImm < 1<<12)
 134       // icmp X, #-C -> cmn X, #C
 135       return 0;
 136     if (ST->isThumb() && NegImm < 1<<8)
 137       // icmp X, #-C -> adds X, #C
 138       return 0;
 139   }
 140
 141   // xor a, -1 can always be folded to MVN
 142   if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
 143     return 0;
 144
 145   return getIntImmCost(Imm, Ty);
 146 }
 147
 148 int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 149                                  const Instruction *I) {
 150   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 151   assert(ISD && "Invalid opcode");
 152
 153   // Single to/from double precision conversions.
 154   static const CostTblEntry NEONFltDblTbl[] = {
 155     // Vector fptrunc/fpext conversions.
 156     { ISD::FP_ROUND,   MVT::v2f64, 2 },
 157     { ISD::FP_EXTEND,  MVT::v2f32, 2 },
 158     { ISD::FP_EXTEND,  MVT::v4f32, 4 }
 159   };
 160
 161   if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
 162                                           ISD == ISD::FP_EXTEND)) {
 163     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
 164     if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
 165       return LT.first * Entry->Cost;
 166   }
 167
 168   EVT SrcTy = TLI->getValueType(DL, Src);
 169   EVT DstTy = TLI->getValueType(DL, Dst);
 170
 171   if (!SrcTy.isSimple() || !DstTy.isSimple())
 172     return BaseT::getCastInstrCost(Opcode, Dst, Src);
 173
 174   // The extend of a load is free
 175   if (I && isa<LoadInst>(I->getOperand(0))) {
 176     static const TypeConversionCostTblEntry LoadConversionTbl[] = {
 177         {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
 178         {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
 179         {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
 180         {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
 181         {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
 182         {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
 183         {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
 184         {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
 185         {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
 186         {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
 187         {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
 188         {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
 189     };
 190     if (const auto *Entry = ConvertCostTableLookup(
 191             LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
 192       return Entry->Cost;
 193
 194     static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
 195         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
 196         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
 197         {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
 198         {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
 199         {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
 200         {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
 201     };
 202     if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
 203       if (const auto *Entry =
 204               ConvertCostTableLookup(MVELoadConversionTbl, ISD,
 205                                      DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
 206         return Entry->Cost;
 207     }
 208   }
 209
 210   // Some arithmetic, load and store operations have specific instructions
 211   // to cast up/down their types automatically at no extra cost.
 212   // TODO: Get these tables to know at least what the related operations are.
 213   static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
 214     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
 215     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
 216     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
 217     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
 218     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
 219     { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
 220
 221     // The number of vmovl instructions for the extension.
 222     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
 223     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
 224     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
 225     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
 226     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
 227     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
 228     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
 229     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
 230     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 231     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
 232
 233     // Operations that we legalize using splitting.
 234     { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 6 },
 235     { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 3 },
 236
 237     // Vector float <-> i32 conversions.
 238     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
 239     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
 240
 241     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
 242     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
 243     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
 244     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
 245     { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
 246     { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
 247     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
 248     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
 249     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
 250     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
 251     { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
 252     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
 253     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
 254     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
 255     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
 256     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
 257     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
 258     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
 259     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
 260     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
 261
 262     { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
 263     { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
 264     { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
 265     { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
 266     { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
 267     { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
 268
 269     // Vector double <-> i32 conversions.
 270     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 271     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 272
 273     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
 274     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
 275     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
 276     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
 277     { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 278     { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
 279
 280     { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
 281     { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
 282     { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
 283     { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
 284     { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
 285     { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
 286   };
 287
 288   if (SrcTy.isVector() && ST->hasNEON()) {
 289     if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
 290                                                    DstTy.getSimpleVT(),
 291                                                    SrcTy.getSimpleVT()))
 292       return Entry->Cost;
 293   }
 294
 295   // Scalar float to integer conversions.
 296   static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
 297     { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
 298     { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
 299     { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
 300     { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
 301     { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
 302     { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
 303     { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
 304     { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
 305     { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
 306     { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
 307     { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
 308     { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
 309     { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
 310     { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
 311     { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
 312     { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
 313     { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
 314     { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
 315     { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
 316     { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
 317   };
 318   if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
 319     if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
 320                                                    DstTy.getSimpleVT(),
 321                                                    SrcTy.getSimpleVT()))
 322       return Entry->Cost;
 323   }
 324
 325   // Scalar integer to float conversions.
 326   static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
 327     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
 328     { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
 329     { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
 330     { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
 331     { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
 332     { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
 333     { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
 334     { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
 335     { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
 336     { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
 337     { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
 338     { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
 339     { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
 340     { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
 341     { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
 342     { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
 343     { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
 344     { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
 345     { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
 346     { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
 347   };
 348
 349   if (SrcTy.isInteger() && ST->hasNEON()) {
 350     if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
 351                                                    ISD, DstTy.getSimpleVT(),
 352                                                    SrcTy.getSimpleVT()))
 353       return Entry->Cost;
 354   }
 355
 356   // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
 357   // instruction, i8->i32 is two. i64 zexts are an VAND with a constant, sext
 358   // are linearised so take more.
 359   static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
 360     { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 361     { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
 362     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 363     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
 364     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
 365     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
 366     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 367     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
 368     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
 369     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
 370     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
 371     { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
 372   };
 373
 374   if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
 375     if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
 376                                                    ISD, DstTy.getSimpleVT(),
 377                                                    SrcTy.getSimpleVT()))
 378       return Entry->Cost * ST->getMVEVectorCostFactor();
 379   }
 380
 381   // Scalar integer conversion costs.
 382   static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
 383     // i16 -> i64 requires two dependent operations.
 384     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
 385
 386     // Truncates on i64 are assumed to be free.
 387     { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
 388     { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
 389     { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
 390     { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
 391   };
 392
 393   if (SrcTy.isInteger()) {
 394     if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
 395                                                    DstTy.getSimpleVT(),
 396                                                    SrcTy.getSimpleVT()))
 397       return Entry->Cost;
 398   }
 399
 400   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
 401                      ? ST->getMVEVectorCostFactor()
 402                      : 1;
 403   return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
 404 }
 405
 406 int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
 407                                    unsigned Index) {
 408   // Penalize inserting into an D-subregister. We end up with a three times
 409   // lower estimated throughput on swift.
 410   if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
 411       ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
 412     return 3;
 413
 414   if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
 415                         Opcode == Instruction::ExtractElement)) {
 416     // Cross-class copies are expensive on many microarchitectures,
 417     // so assume they are expensive by default.
 418     if (ValTy->getVectorElementType()->isIntegerTy())
 419       return 3;
 420
 421     // Even if it's not a cross class copy, this likely leads to mixing
 422     // of NEON and VFP code and should be therefore penalized.
 423     if (ValTy->isVectorTy() &&
 424         ValTy->getScalarSizeInBits() <= 32)
 425       return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U);
 426   }
 427
 428   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
 429                                  Opcode == Instruction::ExtractElement)) {
 430     // We say MVE moves costs at least the MVEVectorCostFactor, even though
 431     // they are scalar instructions. This helps prevent mixing scalar and
 432     // vector, to prevent vectorising where we end up just scalarising the
 433     // result anyway.
 434     return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
 435                     ST->getMVEVectorCostFactor()) *
 436            ValTy->getVectorNumElements() / 2;
 437   }
 438
 439   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 440 }
 441
 442 int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
 443                                    const Instruction *I) {
 444   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 445   // On NEON a vector select gets lowered to vbsl.
 446   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
 447     // Lowering of some vector selects is currently far from perfect.
 448     static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
 449       { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
 450       { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
 451       { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
 452     };
 453
 454     EVT SelCondTy = TLI->getValueType(DL, CondTy);
 455     EVT SelValTy = TLI->getValueType(DL, ValTy);
 456     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
 457       if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
 458                                                      SelCondTy.getSimpleVT(),
 459                                                      SelValTy.getSimpleVT()))
 460         return Entry->Cost;
 461     }
 462
 463     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 464     return LT.first;
 465   }
 466
 467   int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
 468                      ? ST->getMVEVectorCostFactor()
 469                      : 1;
 470   return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 471 }
 472
 473 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 474                                           const SCEV *Ptr) {
 475   // Address computations in vectorized code with non-consecutive addresses will
 476   // likely result in more instructions compared to scalar code where the
 477   // computation can more often be merged into the index mode. The resulting
 478   // extra micro-ops can significantly decrease throughput.
 479   unsigned NumVectorInstToHideOverhead = 10;
 480   int MaxMergeDistance = 64;
 481
 482   if (ST->hasNEON()) {
 483     if (Ty->isVectorTy() && SE &&
 484         !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
 485       return NumVectorInstToHideOverhead;
 486
 487     // In many cases the address computation is not merged into the instruction
 488     // addressing mode.
 489     return 1;
 490   }
 491   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 492 }
 493
 494 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy) {
 495   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
 496     return false;
 497
 498   if (DataTy->isVectorTy()) {
 499     // We don't yet support narrowing or widening masked loads/stores. Expand
 500     // them for the moment.
 501     unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
 502     if (VecWidth != 128)
 503       return false;
 504   }
 505
 506   unsigned EltWidth = DataTy->getScalarSizeInBits();
 507   return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
 508 }
 509
 510 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
 511   const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
 512   assert(MI && "MemcpyInst expected");
 513   ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength());
 514
 515   // To model the cost of a library call, we assume 1 for the call, and
 516   // 3 for the argument setup.
 517   const unsigned LibCallCost = 4;
 518
 519   // If 'size' is not a constant, a library call will be generated.
 520   if (!C)
 521     return LibCallCost;
 522
 523   const unsigned Size = C->getValue().getZExtValue();
 524   const unsigned DstAlign = MI->getDestAlignment();
 525   const unsigned SrcAlign = MI->getSourceAlignment();
 526   const Function *F = I->getParent()->getParent();
 527   const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
 528   std::vector<EVT> MemOps;
 529
 530   // MemOps will be poplulated with a list of data types that needs to be
 531   // loaded and stored. That's why we multiply the number of elements by 2 to
 532   // get the cost for this memcpy.
 533   if (getTLI()->findOptimalMemOpLowering(
 534           MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
 535           false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
 536           MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
 537           F->getAttributes()))
 538     return MemOps.size() * 2;
 539
 540   // If we can't find an optimal memop lowering, return the default cost
 541   return LibCallCost;
 542 }
 543
 544 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 545                                Type *SubTp) {
 546   if (ST->hasNEON()) {
 547     if (Kind == TTI::SK_Broadcast) {
 548       static const CostTblEntry NEONDupTbl[] = {
 549           // VDUP handles these cases.
 550           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
 551           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
 552           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
 553           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
 554           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
 555           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
 556
 557           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
 558           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
 559           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
 560           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
 561
 562       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 563
 564       if (const auto *Entry =
 565               CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
 566         return LT.first * Entry->Cost;
 567     }
 568     if (Kind == TTI::SK_Reverse) {
 569       static const CostTblEntry NEONShuffleTbl[] = {
 570           // Reverse shuffle cost one instruction if we are shuffling within a
 571           // double word (vrev) or two if we shuffle a quad word (vrev, vext).
 572           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
 573           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
 574           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
 575           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
 576           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
 577           {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
 578
 579           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
 580           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
 581           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
 582           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
 583
 584       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 585
 586       if (const auto *Entry =
 587               CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
 588         return LT.first * Entry->Cost;
 589     }
 590     if (Kind == TTI::SK_Select) {
 591       static const CostTblEntry NEONSelShuffleTbl[] = {
 592           // Select shuffle cost table for ARM. Cost is the number of
 593           // instructions
 594           // required to create the shuffled vector.
 595
 596           {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
 597           {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
 598           {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
 599           {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
 600
 601           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
 602           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
 603           {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
 604
 605           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
 606
 607           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
 608
 609       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 610       if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
 611                                               ISD::VECTOR_SHUFFLE, LT.second))
 612         return LT.first * Entry->Cost;
 613     }
 614   }
 615   if (ST->hasMVEIntegerOps()) {
 616     if (Kind == TTI::SK_Broadcast) {
 617       static const CostTblEntry MVEDupTbl[] = {
 618           // VDUP handles these cases.
 619           {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
 620           {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
 621           {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
 622           {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
 623           {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
 624
 625       std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 626
 627       if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
 628                                               LT.second))
 629         return LT.first * Entry->Cost * ST->getMVEVectorCostFactor();
 630     }
 631   }
 632   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy()
 633                      ? ST->getMVEVectorCostFactor()
 634                      : 1;
 635   return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 636 }
 637
 638 int ARMTTIImpl::getArithmeticInstrCost(
 639     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
 640     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
 641     TTI::OperandValueProperties Opd2PropInfo,
 642     ArrayRef<const Value *> Args) {
 643   int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
 644   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 645
 646   const unsigned FunctionCallDivCost = 20;
 647   const unsigned ReciprocalDivCost = 10;
 648   static const CostTblEntry CostTbl[] = {
 649     // Division.
 650     // These costs are somewhat random. Choose a cost of 20 to indicate that
 651     // vectorizing division (added function call) is going to be very expensive.
 652     // Double registers types.
 653     { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
 654     { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
 655     { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
 656     { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
 657     { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
 658     { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
 659     { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
 660     { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
 661     { ISD::SDIV, MVT::v4i16,     ReciprocalDivCost},
 662     { ISD::UDIV, MVT::v4i16,     ReciprocalDivCost},
 663     { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
 664     { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
 665     { ISD::SDIV, MVT::v8i8,      ReciprocalDivCost},
 666     { ISD::UDIV, MVT::v8i8,      ReciprocalDivCost},
 667     { ISD::SREM, MVT::v8i8,  8 * FunctionCallDivCost},
 668     { ISD::UREM, MVT::v8i8,  8 * FunctionCallDivCost},
 669     // Quad register types.
 670     { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
 671     { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
 672     { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
 673     { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
 674     { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
 675     { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
 676     { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
 677     { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
 678     { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
 679     { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
 680     { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
 681     { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
 682     { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
 683     { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
 684     { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
 685     { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
 686     // Multiplication.
 687   };
 688
 689   if (ST->hasNEON()) {
 690     if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
 691       return LT.first * Entry->Cost;
 692
 693     int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
 694                                              Opd1PropInfo, Opd2PropInfo);
 695
 696     // This is somewhat of a hack. The problem that we are facing is that SROA
 697     // creates a sequence of shift, and, or instructions to construct values.
 698     // These sequences are recognized by the ISel and have zero-cost. Not so for
 699     // the vectorized code. Because we have support for v2i64 but not i64 those
 700     // sequences look particularly beneficial to vectorize.
 701     // To work around this we increase the cost of v2i64 operations to make them
 702     // seem less beneficial.
 703     if (LT.second == MVT::v2i64 &&
 704         Op2Info == TargetTransformInfo::OK_UniformConstantValue)
 705       Cost += 4;
 706
 707     return Cost;
 708   }
 709
 710   int BaseCost = ST->hasMVEIntegerOps() && Ty->isVectorTy()
 711                      ? ST->getMVEVectorCostFactor()
 712                      : 1;
 713
 714   // The rest of this mostly follows what is done in BaseT::getArithmeticInstrCost,
 715   // without treating floats as more expensive than scalars or increasing the
 716   // costs for custom operations. The result is also multiplied by the
 717   // MVEVectorCostFactor where appropriate.
 718   if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
 719     return LT.first * BaseCost;
 720
 721   // Else this is expand, assume that we need to scalarize this op.
 722   if (Ty->isVectorTy()) {
 723     unsigned Num = Ty->getVectorNumElements();
 724     unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
 725     // Return the cost of multiple scalar invocation plus the cost of
 726     // inserting and extracting the values.
 727     return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
 728   }
 729
 730   return BaseCost;
 731 }
 732
 733 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 734                                 unsigned AddressSpace, const Instruction *I) {
 735   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
 736
 737   if (ST->hasNEON() && Src->isVectorTy() && Alignment != 16 &&
 738       Src->getVectorElementType()->isDoubleTy()) {
 739     // Unaligned loads/stores are extremely inefficient.
 740     // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
 741     return LT.first * 4;
 742   }
 743   int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
 744                      ? ST->getMVEVectorCostFactor()
 745                      : 1;
 746   return BaseCost * LT.first;
 747 }
 748
 749 int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
 750                                            unsigned Factor,
 751                                            ArrayRef<unsigned> Indices,
 752                                            unsigned Alignment,
 753                                            unsigned AddressSpace,
 754                                            bool UseMaskForCond,
 755                                            bool UseMaskForGaps) {
 756   assert(Factor >= 2 && "Invalid interleave factor");
 757   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 758
 759   // vldN/vstN doesn't support vector types of i64/f64 element.
 760   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 761
 762   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
 763       !UseMaskForCond && !UseMaskForGaps) {
 764     unsigned NumElts = VecTy->getVectorNumElements();
 765     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 766
 767     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
 768     // Accesses having vector types that are a multiple of 128 bits can be
 769     // matched to more than one vldN/vstN instruction.
 770     if (NumElts % Factor == 0 &&
 771         TLI->isLegalInterleavedAccessType(SubVecTy, DL))
 772       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
 773   }
 774
 775   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
 776                                            Alignment, AddressSpace,
 777                                            UseMaskForCond, UseMaskForGaps);
 778 }
 779
 780 bool ARMTTIImpl::isLoweredToCall(const Function *F) {
 781   if (!F->isIntrinsic())
 782     BaseT::isLoweredToCall(F);
 783
 784   // Assume all Arm-specific intrinsics map to an instruction.
 785   if (F->getName().startswith("llvm.arm"))
 786     return false;
 787
 788   switch (F->getIntrinsicID()) {
 789   default: break;
 790   case Intrinsic::powi:
 791   case Intrinsic::sin:
 792   case Intrinsic::cos:
 793   case Intrinsic::pow:
 794   case Intrinsic::log:
 795   case Intrinsic::log10:
 796   case Intrinsic::log2:
 797   case Intrinsic::exp:
 798   case Intrinsic::exp2:
 799     return true;
 800   case Intrinsic::sqrt:
 801   case Intrinsic::fabs:
 802   case Intrinsic::copysign:
 803   case Intrinsic::floor:
 804   case Intrinsic::ceil:
 805   case Intrinsic::trunc:
 806   case Intrinsic::rint:
 807   case Intrinsic::nearbyint:
 808   case Intrinsic::round:
 809   case Intrinsic::canonicalize:
 810   case Intrinsic::lround:
 811   case Intrinsic::llround:
 812   case Intrinsic::lrint:
 813   case Intrinsic::llrint:
 814     if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
 815       return true;
 816     if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
 817       return true;
 818     // Some operations can be handled by vector instructions and assume
 819     // unsupported vectors will be expanded into supported scalar ones.
 820     // TODO Handle scalar operations properly.
 821     return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
 822   case Intrinsic::masked_store:
 823   case Intrinsic::masked_load:
 824   case Intrinsic::masked_gather:
 825   case Intrinsic::masked_scatter:
 826     return !ST->hasMVEIntegerOps();
 827   case Intrinsic::sadd_with_overflow:
 828   case Intrinsic::uadd_with_overflow:
 829   case Intrinsic::ssub_with_overflow:
 830   case Intrinsic::usub_with_overflow:
 831   case Intrinsic::sadd_sat:
 832   case Intrinsic::uadd_sat:
 833   case Intrinsic::ssub_sat:
 834   case Intrinsic::usub_sat:
 835     return false;
 836   }
 837
 838   return BaseT::isLoweredToCall(F);
 839 }
 840
 841 bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
 842                                           AssumptionCache &AC,
 843                                           TargetLibraryInfo *LibInfo,
 844                                           HardwareLoopInfo &HWLoopInfo) {
 845   // Low-overhead branches are only supported in the 'low-overhead branch'
 846   // extension of v8.1-m.
 847   if (!ST->hasLOB() || DisableLowOverheadLoops)
 848     return false;
 849
 850   if (!SE.hasLoopInvariantBackedgeTakenCount(L))
 851     return false;
 852
 853   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
 854   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
 855     return false;
 856
 857   const SCEV *TripCountSCEV =
 858     SE.getAddExpr(BackedgeTakenCount,
 859                   SE.getOne(BackedgeTakenCount->getType()));
 860
 861   // We need to store the trip count in LR, a 32-bit register.
 862   if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
 863     return false;
 864
 865   // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
 866   // point in generating a hardware loop if that's going to happen.
 867   auto MaybeCall = [this](Instruction &I) {
 868     const ARMTargetLowering *TLI = getTLI();
 869     unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
 870     EVT VT = TLI->getValueType(DL, I.getType(), true);
 871     if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
 872       return true;
 873
 874     // Check if an intrinsic will be lowered to a call and assume that any
 875     // other CallInst will generate a bl.
 876     if (auto *Call = dyn_cast<CallInst>(&I)) {
 877       if (isa<IntrinsicInst>(Call)) {
 878         if (const Function *F = Call->getCalledFunction())
 879           return isLoweredToCall(F);
 880       }
 881       return true;
 882     }
 883
 884     // FPv5 provides conversions between integer, double-precision,
 885     // single-precision, and half-precision formats.
 886     switch (I.getOpcode()) {
 887     default:
 888       break;
 889     case Instruction::FPToSI:
 890     case Instruction::FPToUI:
 891     case Instruction::SIToFP:
 892     case Instruction::UIToFP:
 893     case Instruction::FPTrunc:
 894     case Instruction::FPExt:
 895       return !ST->hasFPARMv8Base();
 896     }
 897
 898     // FIXME: Unfortunately the approach of checking the Operation Action does
 899     // not catch all cases of Legalization that use library calls. Our
 900     // Legalization step categorizes some transformations into library calls as
 901     // Custom, Expand or even Legal when doing type legalization. So for now
 902     // we have to special case for instance the SDIV of 64bit integers and the
 903     // use of floating point emulation.
 904     if (VT.isInteger() && VT.getSizeInBits() >= 64) {
 905       switch (ISD) {
 906       default:
 907         break;
 908       case ISD::SDIV:
 909       case ISD::UDIV:
 910       case ISD::SREM:
 911       case ISD::UREM:
 912       case ISD::SDIVREM:
 913       case ISD::UDIVREM:
 914         return true;
 915       }
 916     }
 917
 918     // Assume all other non-float operations are supported.
 919     if (!VT.isFloatingPoint())
 920       return false;
 921
 922     // We'll need a library call to handle most floats when using soft.
 923     if (TLI->useSoftFloat()) {
 924       switch (I.getOpcode()) {
 925       default:
 926         return true;
 927       case Instruction::Alloca:
 928       case Instruction::Load:
 929       case Instruction::Store:
 930       case Instruction::Select:
 931       case Instruction::PHI:
 932         return false;
 933       }
 934     }
 935
 936     // We'll need a libcall to perform double precision operations on a single
 937     // precision only FPU.
 938     if (I.getType()->isDoubleTy() && !ST->hasFP64())
 939       return true;
 940
 941     // Likewise for half precision arithmetic.
 942     if (I.getType()->isHalfTy() && !ST->hasFullFP16())
 943       return true;
 944
 945     return false;
 946   };
 947
 948   auto IsHardwareLoopIntrinsic = [](Instruction &I) {
 949     if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
 950       switch (Call->getIntrinsicID()) {
 951       default:
 952         break;
 953       case Intrinsic::set_loop_iterations:
 954       case Intrinsic::test_set_loop_iterations:
 955       case Intrinsic::loop_decrement:
 956       case Intrinsic::loop_decrement_reg:
 957         return true;
 958       }
 959     }
 960     return false;
 961   };
 962
 963   // Scan the instructions to see if there's any that we know will turn into a
 964   // call or if this loop is already a low-overhead loop.
 965   auto ScanLoop = [&](Loop *L) {
 966     for (auto *BB : L->getBlocks()) {
 967       for (auto &I : *BB) {
 968         if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
 969           return false;
 970       }
 971     }
 972     return true;
 973   };
 974
 975   // Visit inner loops.
 976   for (auto Inner : *L)
 977     if (!ScanLoop(Inner))
 978       return false;
 979
 980   if (!ScanLoop(L))
 981     return false;
 982
 983   // TODO: Check whether the trip count calculation is expensive. If L is the
 984   // inner loop but we know it has a low trip count, calculating that trip
 985   // count (in the parent loop) may be detrimental.
 986
 987   LLVMContext &C = L->getHeader()->getContext();
 988   HWLoopInfo.CounterInReg = true;
 989   HWLoopInfo.IsNestingLegal = false;
 990   HWLoopInfo.PerformEntryTest = true;
 991   HWLoopInfo.CountType = Type::getInt32Ty(C);
 992   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
 993   return true;
 994 }
 995
 996 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 997                                          TTI::UnrollingPreferences &UP) {
 998   // Only currently enable these preferences for M-Class cores.
 999   if (!ST->isMClass())
1000     return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
1001
1002   // Disable loop unrolling for Oz and Os.
1003   UP.OptSizeThreshold = 0;
1004   UP.PartialOptSizeThreshold = 0;
1005   if (L->getHeader()->getParent()->hasOptSize())
1006     return;
1007
1008   // Only enable on Thumb-2 targets.
1009   if (!ST->isThumb2())
1010     return;
1011
1012   SmallVector<BasicBlock*, 4> ExitingBlocks;
1013   L->getExitingBlocks(ExitingBlocks);
1014   LLVM_DEBUG(dbgs() << "Loop has:\n"
1015                     << "Blocks: " << L->getNumBlocks() << "\n"
1016                     << "Exit blocks: " << ExitingBlocks.size() << "\n");
1017
1018   // Only allow another exit other than the latch. This acts as an early exit
1019   // as it mirrors the profitability calculation of the runtime unroller.
1020   if (ExitingBlocks.size() > 2)
1021     return;
1022
1023   // Limit the CFG of the loop body for targets with a branch predictor.
1024   // Allowing 4 blocks permits if-then-else diamonds in the body.
1025   if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
1026     return;
1027
1028   // Scan the loop: don't unroll loops with calls as this could prevent
1029   // inlining.
1030   unsigned Cost = 0;
1031   for (auto *BB : L->getBlocks()) {
1032     for (auto &I : *BB) {
1033       if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1034         ImmutableCallSite CS(&I);
1035         if (const Function *F = CS.getCalledFunction()) {
1036           if (!isLoweredToCall(F))
1037             continue;
1038         }
1039         return;
1040       }
1041       // Don't unroll vectorised loop. MVE does not benefit from it as much as
1042       // scalar code.
1043       if (I.getType()->isVectorTy())
1044         return;
1045
1046       SmallVector<const Value*, 4> Operands(I.value_op_begin(),
1047                                             I.value_op_end());
1048       Cost += getUserCost(&I, Operands);
1049     }
1050   }
1051
1052   LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1053
1054   UP.Partial = true;
1055   UP.Runtime = true;
1056   UP.UpperBound = true;
1057   UP.UnrollRemainder = true;
1058   UP.DefaultUnrollRuntimeCount = 4;
1059   UP.UnrollAndJam = true;
1060   UP.UnrollAndJamInnerLoopThreshold = 60;
1061
1062   // Force unrolling small loops can be very useful because of the branch
1063   // taken cost of the backedge.
1064   if (Cost < 12)
1065     UP.Force = true;
1066 }
1067
1068 bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
1069                                        TTI::ReductionFlags Flags) const {
1070   assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
1071   unsigned ScalarBits = Ty->getScalarSizeInBits();
1072   if (!ST->hasMVEIntegerOps())
1073     return false;
1074
1075   switch (Opcode) {
1076   case Instruction::FAdd:
1077   case Instruction::FMul:
1078   case Instruction::And:
1079   case Instruction::Or:
1080   case Instruction::Xor:
1081   case Instruction::Mul:
1082   case Instruction::FCmp:
1083     return false;
1084   case Instruction::ICmp:
1085   case Instruction::Add:
1086     return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
1087   default:
1088     llvm_unreachable("Unhandled reduction opcode");
1089   }
1090   return false;
1091 }