lib/Target/ARM/ARMISelLowering.cpp

   1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the interfaces that ARM uses to lower LLVM code into a
  10 // selection DAG.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "ARMISelLowering.h"
  15 #include "ARMBaseInstrInfo.h"
  16 #include "ARMBaseRegisterInfo.h"
  17 #include "ARMCallingConv.h"
  18 #include "ARMConstantPoolValue.h"
  19 #include "ARMMachineFunctionInfo.h"
  20 #include "ARMPerfectShuffle.h"
  21 #include "ARMRegisterInfo.h"
  22 #include "ARMSelectionDAGInfo.h"
  23 #include "ARMSubtarget.h"
  24 #include "MCTargetDesc/ARMAddressingModes.h"
  25 #include "MCTargetDesc/ARMBaseInfo.h"
  26 #include "Utils/ARMBaseInfo.h"
  27 #include "llvm/ADT/APFloat.h"
  28 #include "llvm/ADT/APInt.h"
  29 #include "llvm/ADT/ArrayRef.h"
  30 #include "llvm/ADT/BitVector.h"
  31 #include "llvm/ADT/DenseMap.h"
  32 #include "llvm/ADT/STLExtras.h"
  33 #include "llvm/ADT/SmallPtrSet.h"
  34 #include "llvm/ADT/SmallVector.h"
  35 #include "llvm/ADT/Statistic.h"
  36 #include "llvm/ADT/StringExtras.h"
  37 #include "llvm/ADT/StringRef.h"
  38 #include "llvm/ADT/StringSwitch.h"
  39 #include "llvm/ADT/Triple.h"
  40 #include "llvm/ADT/Twine.h"
  41 #include "llvm/Analysis/VectorUtils.h"
  42 #include "llvm/CodeGen/CallingConvLower.h"
  43 #include "llvm/CodeGen/ISDOpcodes.h"
  44 #include "llvm/CodeGen/IntrinsicLowering.h"
  45 #include "llvm/CodeGen/MachineBasicBlock.h"
  46 #include "llvm/CodeGen/MachineConstantPool.h"
  47 #include "llvm/CodeGen/MachineFrameInfo.h"
  48 #include "llvm/CodeGen/MachineFunction.h"
  49 #include "llvm/CodeGen/MachineInstr.h"
  50 #include "llvm/CodeGen/MachineInstrBuilder.h"
  51 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  52 #include "llvm/CodeGen/MachineMemOperand.h"
  53 #include "llvm/CodeGen/MachineOperand.h"
  54 #include "llvm/CodeGen/MachineRegisterInfo.h"
  55 #include "llvm/CodeGen/RuntimeLibcalls.h"
  56 #include "llvm/CodeGen/SelectionDAG.h"
  57 #include "llvm/CodeGen/SelectionDAGNodes.h"
  58 #include "llvm/CodeGen/TargetInstrInfo.h"
  59 #include "llvm/CodeGen/TargetLowering.h"
  60 #include "llvm/CodeGen/TargetOpcodes.h"
  61 #include "llvm/CodeGen/TargetRegisterInfo.h"
  62 #include "llvm/CodeGen/TargetSubtargetInfo.h"
  63 #include "llvm/CodeGen/ValueTypes.h"
  64 #include "llvm/IR/Attributes.h"
  65 #include "llvm/IR/CallingConv.h"
  66 #include "llvm/IR/Constant.h"
  67 #include "llvm/IR/Constants.h"
  68 #include "llvm/IR/DataLayout.h"
  69 #include "llvm/IR/DebugLoc.h"
  70 #include "llvm/IR/DerivedTypes.h"
  71 #include "llvm/IR/Function.h"
  72 #include "llvm/IR/GlobalAlias.h"
  73 #include "llvm/IR/GlobalValue.h"
  74 #include "llvm/IR/GlobalVariable.h"
  75 #include "llvm/IR/IRBuilder.h"
  76 #include "llvm/IR/InlineAsm.h"
  77 #include "llvm/IR/Instruction.h"
  78 #include "llvm/IR/Instructions.h"
  79 #include "llvm/IR/IntrinsicInst.h"
  80 #include "llvm/IR/Intrinsics.h"
  81 #include "llvm/IR/Module.h"
  82 #include "llvm/IR/PatternMatch.h"
  83 #include "llvm/IR/Type.h"
  84 #include "llvm/IR/User.h"
  85 #include "llvm/IR/Value.h"
  86 #include "llvm/MC/MCInstrDesc.h"
  87 #include "llvm/MC/MCInstrItineraries.h"
  88 #include "llvm/MC/MCRegisterInfo.h"
  89 #include "llvm/MC/MCSchedule.h"
  90 #include "llvm/Support/AtomicOrdering.h"
  91 #include "llvm/Support/BranchProbability.h"
  92 #include "llvm/Support/Casting.h"
  93 #include "llvm/Support/CodeGen.h"
  94 #include "llvm/Support/CommandLine.h"
  95 #include "llvm/Support/Compiler.h"
  96 #include "llvm/Support/Debug.h"
  97 #include "llvm/Support/ErrorHandling.h"
  98 #include "llvm/Support/KnownBits.h"
  99 #include "llvm/Support/MachineValueType.h"
 100 #include "llvm/Support/MathExtras.h"
 101 #include "llvm/Support/raw_ostream.h"
 102 #include "llvm/Target/TargetMachine.h"
 103 #include "llvm/Target/TargetOptions.h"
 104 #include <algorithm>
 105 #include <cassert>
 106 #include <cstdint>
 107 #include <cstdlib>
 108 #include <iterator>
 109 #include <limits>
 110 #include <string>
 111 #include <tuple>
 112 #include <utility>
 113 #include <vector>
 114
 115 using namespace llvm;
 116 using namespace llvm::PatternMatch;
 117
 118 #define DEBUG_TYPE "arm-isel"
 119
 120 STATISTIC(NumTailCalls, "Number of tail calls");
 121 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
 122 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
 123 STATISTIC(NumConstpoolPromoted,
 124   "Number of constants with their storage promoted into constant pools");
 125
 126 static cl::opt<bool>
 127 ARMInterworking("arm-interworking", cl::Hidden,
 128   cl::desc("Enable / disable ARM interworking (for debugging only)"),
 129   cl::init(true));
 130
 131 static cl::opt<bool> EnableConstpoolPromotion(
 132     "arm-promote-constant", cl::Hidden,
 133     cl::desc("Enable / disable promotion of unnamed_addr constants into "
 134              "constant pools"),
 135     cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
 136 static cl::opt<unsigned> ConstpoolPromotionMaxSize(
 137     "arm-promote-constant-max-size", cl::Hidden,
 138     cl::desc("Maximum size of constant to promote into a constant pool"),
 139     cl::init(64));
 140 static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
 141     "arm-promote-constant-max-total", cl::Hidden,
 142     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
 143     cl::init(128));
 144
 145 // The APCS parameter registers.
 146 static const MCPhysReg GPRArgRegs[] = {
 147   ARM::R0, ARM::R1, ARM::R2, ARM::R3
 148 };
 149
 150 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
 151                                        MVT PromotedBitwiseVT) {
 152   if (VT != PromotedLdStVT) {
 153     setOperationAction(ISD::LOAD, VT, Promote);
 154     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
 155
 156     setOperationAction(ISD::STORE, VT, Promote);
 157     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
 158   }
 159
 160   MVT ElemTy = VT.getVectorElementType();
 161   if (ElemTy != MVT::f64)
 162     setOperationAction(ISD::SETCC, VT, Custom);
 163   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 164   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 165   if (ElemTy == MVT::i32) {
 166     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
 167     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
 168     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
 169     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
 170   } else {
 171     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 172     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 173     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 174     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 175   }
 176   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
 177   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
 178   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
 179   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
 180   setOperationAction(ISD::SELECT,            VT, Expand);
 181   setOperationAction(ISD::SELECT_CC,         VT, Expand);
 182   setOperationAction(ISD::VSELECT,           VT, Expand);
 183   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 184   if (VT.isInteger()) {
 185     setOperationAction(ISD::SHL, VT, Custom);
 186     setOperationAction(ISD::SRA, VT, Custom);
 187     setOperationAction(ISD::SRL, VT, Custom);
 188   }
 189
 190   // Promote all bit-wise operations.
 191   if (VT.isInteger() && VT != PromotedBitwiseVT) {
 192     setOperationAction(ISD::AND, VT, Promote);
 193     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
 194     setOperationAction(ISD::OR,  VT, Promote);
 195     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
 196     setOperationAction(ISD::XOR, VT, Promote);
 197     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
 198   }
 199
 200   // Neon does not support vector divide/remainder operations.
 201   setOperationAction(ISD::SDIV, VT, Expand);
 202   setOperationAction(ISD::UDIV, VT, Expand);
 203   setOperationAction(ISD::FDIV, VT, Expand);
 204   setOperationAction(ISD::SREM, VT, Expand);
 205   setOperationAction(ISD::UREM, VT, Expand);
 206   setOperationAction(ISD::FREM, VT, Expand);
 207
 208   if (!VT.isFloatingPoint() &&
 209       VT != MVT::v2i64 && VT != MVT::v1i64)
 210     for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
 211       setOperationAction(Opcode, VT, Legal);
 212 }
 213
 214 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
 215   addRegisterClass(VT, &ARM::DPRRegClass);
 216   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
 217 }
 218
 219 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
 220   addRegisterClass(VT, &ARM::DPairRegClass);
 221   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 222 }
 223
 224 void ARMTargetLowering::setAllExpand(MVT VT) {
 225   for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
 226     setOperationAction(Opc, VT, Expand);
 227
 228   // We support these really simple operations even on types where all
 229   // the actual arithmetic has to be broken down into simpler
 230   // operations or turned into library calls.
 231   setOperationAction(ISD::BITCAST, VT, Legal);
 232   setOperationAction(ISD::LOAD, VT, Legal);
 233   setOperationAction(ISD::STORE, VT, Legal);
 234   setOperationAction(ISD::UNDEF, VT, Legal);
 235 }
 236
 237 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
 238                                        LegalizeAction Action) {
 239   setLoadExtAction(ISD::EXTLOAD,  From, To, Action);
 240   setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
 241   setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
 242 }
 243
 244 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
 245   const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
 246
 247   for (auto VT : IntTypes) {
 248     addRegisterClass(VT, &ARM::QPRRegClass);
 249     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 250     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 251     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 252     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 253     setOperationAction(ISD::SHL, VT, Custom);
 254     setOperationAction(ISD::SRA, VT, Custom);
 255     setOperationAction(ISD::SRL, VT, Custom);
 256     setOperationAction(ISD::SMIN, VT, Legal);
 257     setOperationAction(ISD::SMAX, VT, Legal);
 258     setOperationAction(ISD::UMIN, VT, Legal);
 259     setOperationAction(ISD::UMAX, VT, Legal);
 260     setOperationAction(ISD::ABS, VT, Legal);
 261     setOperationAction(ISD::SETCC, VT, Custom);
 262
 263     // No native support for these.
 264     setOperationAction(ISD::UDIV, VT, Expand);
 265     setOperationAction(ISD::SDIV, VT, Expand);
 266     setOperationAction(ISD::UREM, VT, Expand);
 267     setOperationAction(ISD::SREM, VT, Expand);
 268
 269     if (!HasMVEFP) {
 270       setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 271       setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 272       setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 273       setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 274     }
 275   }
 276
 277   const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
 278   for (auto VT : FloatTypes) {
 279     addRegisterClass(VT, &ARM::QPRRegClass);
 280     if (!HasMVEFP)
 281       setAllExpand(VT);
 282
 283     // These are legal or custom whether we have MVE.fp or not
 284     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 285     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 286     setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
 287     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 288     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 289     setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
 290     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
 291     setOperationAction(ISD::SETCC, VT, Custom);
 292
 293     if (HasMVEFP) {
 294       setOperationAction(ISD::FMINNUM, VT, Legal);
 295       setOperationAction(ISD::FMAXNUM, VT, Legal);
 296       setOperationAction(ISD::FROUND, VT, Legal);
 297
 298       // No native support for these.
 299       setOperationAction(ISD::FDIV, VT, Expand);
 300       setOperationAction(ISD::FREM, VT, Expand);
 301       setOperationAction(ISD::FSQRT, VT, Expand);
 302       setOperationAction(ISD::FSIN, VT, Expand);
 303       setOperationAction(ISD::FCOS, VT, Expand);
 304       setOperationAction(ISD::FPOW, VT, Expand);
 305       setOperationAction(ISD::FLOG, VT, Expand);
 306       setOperationAction(ISD::FLOG2, VT, Expand);
 307       setOperationAction(ISD::FLOG10, VT, Expand);
 308       setOperationAction(ISD::FEXP, VT, Expand);
 309       setOperationAction(ISD::FEXP2, VT, Expand);
 310       setOperationAction(ISD::FNEARBYINT, VT, Expand);
 311     }
 312   }
 313
 314   // We 'support' these types up to bitcast/load/store level, regardless of
 315   // MVE integer-only / float support. Only doing FP data processing on the FP
 316   // vector types is inhibited at integer-only level.
 317   const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
 318   for (auto VT : LongTypes) {
 319     addRegisterClass(VT, &ARM::QPRRegClass);
 320     setAllExpand(VT);
 321     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 322     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 323     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 324   }
 325   // We can do bitwise operations on v2i64 vectors
 326   setOperationAction(ISD::AND, MVT::v2i64, Legal);
 327   setOperationAction(ISD::OR, MVT::v2i64, Legal);
 328   setOperationAction(ISD::XOR, MVT::v2i64, Legal);
 329
 330   // It is legal to extload from v4i8 to v4i16 or v4i32.
 331   addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
 332   addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
 333   addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
 334
 335   // Some truncating stores are legal too.
 336   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
 337   setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
 338   setTruncStoreAction(MVT::v8i16, MVT::v8i8,  Legal);
 339
 340   // Predicate types
 341   const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
 342   for (auto VT : pTypes) {
 343     addRegisterClass(VT, &ARM::VCCRRegClass);
 344     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 345     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 346     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 347     setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
 348     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 349     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 350     setOperationAction(ISD::SETCC, VT, Custom);
 351     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
 352   }
 353 }
 354
 355 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
 356                                      const ARMSubtarget &STI)
 357     : TargetLowering(TM), Subtarget(&STI) {
 358   RegInfo = Subtarget->getRegisterInfo();
 359   Itins = Subtarget->getInstrItineraryData();
 360
 361   setBooleanContents(ZeroOrOneBooleanContent);
 362   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 363
 364   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
 365       !Subtarget->isTargetWatchOS()) {
 366     bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
 367     for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
 368       setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
 369                             IsHFTarget ? CallingConv::ARM_AAPCS_VFP
 370                                        : CallingConv::ARM_AAPCS);
 371   }
 372
 373   if (Subtarget->isTargetMachO()) {
 374     // Uses VFP for Thumb libfuncs if available.
 375     if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
 376         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
 377       static const struct {
 378         const RTLIB::Libcall Op;
 379         const char * const Name;
 380         const ISD::CondCode Cond;
 381       } LibraryCalls[] = {
 382         // Single-precision floating-point arithmetic.
 383         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
 384         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
 385         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
 386         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
 387
 388         // Double-precision floating-point arithmetic.
 389         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
 390         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
 391         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
 392         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
 393
 394         // Single-precision comparisons.
 395         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
 396         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
 397         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
 398         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
 399         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
 400         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
 401         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
 402         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
 403
 404         // Double-precision comparisons.
 405         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
 406         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
 407         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
 408         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
 409         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
 410         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
 411         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
 412         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
 413
 414         // Floating-point to integer conversions.
 415         // i64 conversions are done via library routines even when generating VFP
 416         // instructions, so use the same ones.
 417         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
 418         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
 419         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
 420         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
 421
 422         // Conversions between floating types.
 423         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
 424         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
 425
 426         // Integer to floating-point conversions.
 427         // i64 conversions are done via library routines even when generating VFP
 428         // instructions, so use the same ones.
 429         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
 430         // e.g., __floatunsidf vs. __floatunssidfvfp.
 431         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
 432         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
 433         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
 434         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
 435       };
 436
 437       for (const auto &LC : LibraryCalls) {
 438         setLibcallName(LC.Op, LC.Name);
 439         if (LC.Cond != ISD::SETCC_INVALID)
 440           setCmpLibcallCC(LC.Op, LC.Cond);
 441       }
 442     }
 443   }
 444
 445   // These libcalls are not available in 32-bit.
 446   setLibcallName(RTLIB::SHL_I128, nullptr);
 447   setLibcallName(RTLIB::SRL_I128, nullptr);
 448   setLibcallName(RTLIB::SRA_I128, nullptr);
 449
 450   // RTLIB
 451   if (Subtarget->isAAPCS_ABI() &&
 452       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
 453        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
 454     static const struct {
 455       const RTLIB::Libcall Op;
 456       const char * const Name;
 457       const CallingConv::ID CC;
 458       const ISD::CondCode Cond;
 459     } LibraryCalls[] = {
 460       // Double-precision floating-point arithmetic helper functions
 461       // RTABI chapter 4.1.2, Table 2
 462       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 463       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 464       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 465       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 466
 467       // Double-precision floating-point comparison helper functions
 468       // RTABI chapter 4.1.2, Table 3
 469       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
 470       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
 471       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
 472       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
 473       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
 474       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
 475       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
 476       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 477
 478       // Single-precision floating-point arithmetic helper functions
 479       // RTABI chapter 4.1.2, Table 4
 480       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 481       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 482       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 483       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 484
 485       // Single-precision floating-point comparison helper functions
 486       // RTABI chapter 4.1.2, Table 5
 487       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
 488       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
 489       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
 490       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
 491       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
 492       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
 493       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
 494       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 495
 496       // Floating-point to integer conversions.
 497       // RTABI chapter 4.1.2, Table 6
 498       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 499       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 500       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 501       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 502       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 503       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 504       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 505       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 506
 507       // Conversions between floating types.
 508       // RTABI chapter 4.1.2, Table 7
 509       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 510       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 511       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 512
 513       // Integer to floating-point conversions.
 514       // RTABI chapter 4.1.2, Table 8
 515       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 516       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 517       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 518       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 519       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 520       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 521       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 522       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 523
 524       // Long long helper functions
 525       // RTABI chapter 4.2, Table 9
 526       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 527       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 528       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 529       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 530
 531       // Integer division functions
 532       // RTABI chapter 4.3.1
 533       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 534       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 535       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 536       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 537       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 538       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 539       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 540       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 541     };
 542
 543     for (const auto &LC : LibraryCalls) {
 544       setLibcallName(LC.Op, LC.Name);
 545       setLibcallCallingConv(LC.Op, LC.CC);
 546       if (LC.Cond != ISD::SETCC_INVALID)
 547         setCmpLibcallCC(LC.Op, LC.Cond);
 548     }
 549
 550     // EABI dependent RTLIB
 551     if (TM.Options.EABIVersion == EABI::EABI4 ||
 552         TM.Options.EABIVersion == EABI::EABI5) {
 553       static const struct {
 554         const RTLIB::Libcall Op;
 555         const char *const Name;
 556         const CallingConv::ID CC;
 557         const ISD::CondCode Cond;
 558       } MemOpsLibraryCalls[] = {
 559         // Memory operations
 560         // RTABI chapter 4.3.4
 561         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 562         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 563         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 564       };
 565
 566       for (const auto &LC : MemOpsLibraryCalls) {
 567         setLibcallName(LC.Op, LC.Name);
 568         setLibcallCallingConv(LC.Op, LC.CC);
 569         if (LC.Cond != ISD::SETCC_INVALID)
 570           setCmpLibcallCC(LC.Op, LC.Cond);
 571       }
 572     }
 573   }
 574
 575   if (Subtarget->isTargetWindows()) {
 576     static const struct {
 577       const RTLIB::Libcall Op;
 578       const char * const Name;
 579       const CallingConv::ID CC;
 580     } LibraryCalls[] = {
 581       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
 582       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
 583       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
 584       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
 585       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
 586       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
 587       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
 588       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
 589     };
 590
 591     for (const auto &LC : LibraryCalls) {
 592       setLibcallName(LC.Op, LC.Name);
 593       setLibcallCallingConv(LC.Op, LC.CC);
 594     }
 595   }
 596
 597   // Use divmod compiler-rt calls for iOS 5.0 and later.
 598   if (Subtarget->isTargetMachO() &&
 599       !(Subtarget->isTargetIOS() &&
 600         Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
 601     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
 602     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
 603   }
 604
 605   // The half <-> float conversion functions are always soft-float on
 606   // non-watchos platforms, but are needed for some targets which use a
 607   // hard-float calling convention by default.
 608   if (!Subtarget->isTargetWatchABI()) {
 609     if (Subtarget->isAAPCS_ABI()) {
 610       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
 611       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
 612       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
 613     } else {
 614       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
 615       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
 616       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
 617     }
 618   }
 619
 620   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
 621   // a __gnu_ prefix (which is the default).
 622   if (Subtarget->isTargetAEABI()) {
 623     static const struct {
 624       const RTLIB::Libcall Op;
 625       const char * const Name;
 626       const CallingConv::ID CC;
 627     } LibraryCalls[] = {
 628       { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
 629       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
 630       { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
 631     };
 632
 633     for (const auto &LC : LibraryCalls) {
 634       setLibcallName(LC.Op, LC.Name);
 635       setLibcallCallingConv(LC.Op, LC.CC);
 636     }
 637   }
 638
 639   if (Subtarget->isThumb1Only())
 640     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
 641   else
 642     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
 643
 644   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
 645       Subtarget->hasFPRegs()) {
 646     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
 647     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
 648     if (!Subtarget->hasVFP2Base())
 649       setAllExpand(MVT::f32);
 650     if (!Subtarget->hasFP64())
 651       setAllExpand(MVT::f64);
 652   }
 653
 654   if (Subtarget->hasFullFP16()) {
 655     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
 656     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
 657     setOperationAction(ISD::BITCAST, MVT::i32, Custom);
 658     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 659
 660     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
 661     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
 662   }
 663
 664   for (MVT VT : MVT::vector_valuetypes()) {
 665     for (MVT InnerVT : MVT::vector_valuetypes()) {
 666       setTruncStoreAction(VT, InnerVT, Expand);
 667       addAllExtLoads(VT, InnerVT, Expand);
 668     }
 669
 670     setOperationAction(ISD::MULHS, VT, Expand);
 671     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 672     setOperationAction(ISD::MULHU, VT, Expand);
 673     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 674
 675     setOperationAction(ISD::BSWAP, VT, Expand);
 676   }
 677
 678   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
 679   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 680
 681   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
 682   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
 683
 684   if (Subtarget->hasMVEIntegerOps())
 685     addMVEVectorTypes(Subtarget->hasMVEFloatOps());
 686
 687   // Combine low-overhead loop intrinsics so that we can lower i1 types.
 688   if (Subtarget->hasLOB()) {
 689     setTargetDAGCombine(ISD::BRCOND);
 690     setTargetDAGCombine(ISD::BR_CC);
 691   }
 692
 693   if (Subtarget->hasNEON()) {
 694     addDRTypeForNEON(MVT::v2f32);
 695     addDRTypeForNEON(MVT::v8i8);
 696     addDRTypeForNEON(MVT::v4i16);
 697     addDRTypeForNEON(MVT::v2i32);
 698     addDRTypeForNEON(MVT::v1i64);
 699
 700     addQRTypeForNEON(MVT::v4f32);
 701     addQRTypeForNEON(MVT::v2f64);
 702     addQRTypeForNEON(MVT::v16i8);
 703     addQRTypeForNEON(MVT::v8i16);
 704     addQRTypeForNEON(MVT::v4i32);
 705     addQRTypeForNEON(MVT::v2i64);
 706
 707     if (Subtarget->hasFullFP16()) {
 708       addQRTypeForNEON(MVT::v8f16);
 709       addDRTypeForNEON(MVT::v4f16);
 710     }
 711   }
 712
 713   if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
 714     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
 715     // none of Neon, MVE or VFP supports any arithmetic operations on it.
 716     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
 717     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
 718     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
 719     // FIXME: Code duplication: FDIV and FREM are expanded always, see
 720     // ARMTargetLowering::addTypeForNEON method for details.
 721     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
 722     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
 723     // FIXME: Create unittest.
 724     // In other words, find a case where "copysign" appears in the DAG with
 725     // vector operands.
 726     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
 727     // FIXME: Code duplication: SETCC has custom operation action, see
 728     // ARMTargetLowering::addTypeForNEON method for details.
 729     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
 730     // FIXME: Create unittest for FNEG and for FABS.
 731     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
 732     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
 733     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
 734     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
 735     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
 736     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
 737     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
 738     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
 739     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
 740     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
 741     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
 742     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
 743     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
 744     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
 745     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
 746     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
 747     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
 748     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
 749   }
 750
 751   if (Subtarget->hasNEON()) {
 752     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
 753     // supported for v4f32.
 754     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
 755     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
 756     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
 757     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
 758     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
 759     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
 760     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
 761     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
 762     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
 763     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
 764     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
 765     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
 766     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
 767     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
 768
 769     // Mark v2f32 intrinsics.
 770     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
 771     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
 772     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
 773     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
 774     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
 775     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
 776     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
 777     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
 778     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
 779     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
 780     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
 781     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
 782     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
 783     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
 784
 785     // Neon does not support some operations on v1i64 and v2i64 types.
 786     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 787     // Custom handling for some quad-vector types to detect VMULL.
 788     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
 789     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
 790     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 791     // Custom handling for some vector types to avoid expensive expansions
 792     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
 793     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
 794     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
 795     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
 796     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
 797     // a destination type that is wider than the source, and nor does
 798     // it have a FP_TO_[SU]INT instruction with a narrower destination than
 799     // source.
 800     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
 801     setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
 802     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
 803     setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
 804     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
 805     setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
 806     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
 807     setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
 808
 809     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
 810     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
 811
 812     // NEON does not have single instruction CTPOP for vectors with element
 813     // types wider than 8-bits.  However, custom lowering can leverage the
 814     // v8i8/v16i8 vcnt instruction.
 815     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
 816     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
 817     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
 818     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
 819     setOperationAction(ISD::CTPOP,      MVT::v1i64, Custom);
 820     setOperationAction(ISD::CTPOP,      MVT::v2i64, Custom);
 821
 822     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
 823     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
 824
 825     // NEON does not have single instruction CTTZ for vectors.
 826     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
 827     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
 828     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
 829     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
 830
 831     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
 832     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
 833     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
 834     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
 835
 836     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
 837     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
 838     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
 839     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
 840
 841     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
 842     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
 843     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
 844     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
 845
 846     // NEON only has FMA instructions as of VFP4.
 847     if (!Subtarget->hasVFP4Base()) {
 848       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
 849       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
 850     }
 851
 852     setTargetDAGCombine(ISD::INTRINSIC_VOID);
 853     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 854     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 855     setTargetDAGCombine(ISD::SHL);
 856     setTargetDAGCombine(ISD::SRL);
 857     setTargetDAGCombine(ISD::SRA);
 858     setTargetDAGCombine(ISD::SIGN_EXTEND);
 859     setTargetDAGCombine(ISD::ZERO_EXTEND);
 860     setTargetDAGCombine(ISD::ANY_EXTEND);
 861     setTargetDAGCombine(ISD::STORE);
 862     setTargetDAGCombine(ISD::FP_TO_SINT);
 863     setTargetDAGCombine(ISD::FP_TO_UINT);
 864     setTargetDAGCombine(ISD::FDIV);
 865     setTargetDAGCombine(ISD::LOAD);
 866
 867     // It is legal to extload from v4i8 to v4i16 or v4i32.
 868     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
 869                    MVT::v2i32}) {
 870       for (MVT VT : MVT::integer_vector_valuetypes()) {
 871         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
 872         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
 873         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
 874       }
 875     }
 876   }
 877
 878   if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
 879     setTargetDAGCombine(ISD::BUILD_VECTOR);
 880     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
 881     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 882   }
 883
 884   if (!Subtarget->hasFP64()) {
 885     // When targeting a floating-point unit with only single-precision
 886     // operations, f64 is legal for the few double-precision instructions which
 887     // are present. However, no double-precision operations other than moves,
 888     // loads and stores are provided by the hardware.
 889     setOperationAction(ISD::FADD,       MVT::f64, Expand);
 890     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
 891     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
 892     setOperationAction(ISD::FMA,        MVT::f64, Expand);
 893     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
 894     setOperationAction(ISD::FREM,       MVT::f64, Expand);
 895     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
 896     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
 897     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
 898     setOperationAction(ISD::FABS,       MVT::f64, Expand);
 899     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
 900     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
 901     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
 902     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
 903     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
 904     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
 905     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
 906     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
 907     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
 908     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
 909     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
 910     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
 911     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
 912     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
 913     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 914     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 915     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 916     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 917     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
 918     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
 919     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
 920   }
 921
 922   if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){
 923     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
 924     setOperationAction(ISD::FP_ROUND,  MVT::f16, Custom);
 925   }
 926
 927   if (!Subtarget->hasFP16())
 928     setOperationAction(ISD::FP_EXTEND,  MVT::f32, Custom);
 929
 930   if (!Subtarget->hasFP64())
 931     setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
 932
 933   computeRegisterProperties(Subtarget->getRegisterInfo());
 934
 935   // ARM does not have floating-point extending loads.
 936   for (MVT VT : MVT::fp_valuetypes()) {
 937     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 938     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
 939   }
 940
 941   // ... or truncating stores
 942   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 943   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 944   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 945
 946   // ARM does not have i1 sign extending load.
 947   for (MVT VT : MVT::integer_valuetypes())
 948     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 949
 950   // ARM supports all 4 flavors of integer indexed load / store.
 951   if (!Subtarget->isThumb1Only()) {
 952     for (unsigned im = (unsigned)ISD::PRE_INC;
 953          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 954       setIndexedLoadAction(im,  MVT::i1,  Legal);
 955       setIndexedLoadAction(im,  MVT::i8,  Legal);
 956       setIndexedLoadAction(im,  MVT::i16, Legal);
 957       setIndexedLoadAction(im,  MVT::i32, Legal);
 958       setIndexedStoreAction(im, MVT::i1,  Legal);
 959       setIndexedStoreAction(im, MVT::i8,  Legal);
 960       setIndexedStoreAction(im, MVT::i16, Legal);
 961       setIndexedStoreAction(im, MVT::i32, Legal);
 962     }
 963   } else {
 964     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
 965     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
 966     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
 967   }
 968
 969   setOperationAction(ISD::SADDO, MVT::i32, Custom);
 970   setOperationAction(ISD::UADDO, MVT::i32, Custom);
 971   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
 972   setOperationAction(ISD::USUBO, MVT::i32, Custom);
 973
 974   setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
 975   setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
 976
 977   // i64 operation support.
 978   setOperationAction(ISD::MUL,     MVT::i64, Expand);
 979   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
 980   if (Subtarget->isThumb1Only()) {
 981     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 982     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 983   }
 984   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
 985       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
 986     setOperationAction(ISD::MULHS, MVT::i32, Expand);
 987
 988   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 989   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 990   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 991   setOperationAction(ISD::SRL,       MVT::i64, Custom);
 992   setOperationAction(ISD::SRA,       MVT::i64, Custom);
 993   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
 994
 995   // MVE lowers 64 bit shifts to lsll and lsrl
 996   // assuming that ISD::SRL and SRA of i64 are already marked custom
 997   if (Subtarget->hasMVEIntegerOps())
 998     setOperationAction(ISD::SHL, MVT::i64, Custom);
 999
1000   // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1001   if (Subtarget->isThumb1Only()) {
1002     setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
1003     setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
1004     setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
1005   }
1006
1007   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1008     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
1009
1010   // ARM does not have ROTL.
1011   setOperationAction(ISD::ROTL, MVT::i32, Expand);
1012   for (MVT VT : MVT::vector_valuetypes()) {
1013     setOperationAction(ISD::ROTL, VT, Expand);
1014     setOperationAction(ISD::ROTR, VT, Expand);
1015   }
1016   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
1017   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
1018   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1019     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
1020     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
1021   }
1022
1023   // @llvm.readcyclecounter requires the Performance Monitors extension.
1024   // Default to the 0 expansion on unsupported platforms.
1025   // FIXME: Technically there are older ARM CPUs that have
1026   // implementation-specific ways of obtaining this information.
1027   if (Subtarget->hasPerfMon())
1028     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1029
1030   // BSWAP is only available on ARMv6 and later.
1031   if (!Subtarget->hasV6Ops())
1032     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
1033
1034   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1035                                         : Subtarget->hasDivideInARMMode();
1036   if (!hasDivide) {
1037     // These are expanded into libcalls if the cpu doesn't have HW divider.
1038     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
1039     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
1040   }
1041
1042   if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1043     setOperationAction(ISD::SDIV, MVT::i32, Custom);
1044     setOperationAction(ISD::UDIV, MVT::i32, Custom);
1045
1046     setOperationAction(ISD::SDIV, MVT::i64, Custom);
1047     setOperationAction(ISD::UDIV, MVT::i64, Custom);
1048   }
1049
1050   setOperationAction(ISD::SREM,  MVT::i32, Expand);
1051   setOperationAction(ISD::UREM,  MVT::i32, Expand);
1052
1053   // Register based DivRem for AEABI (RTABI 4.2)
1054   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1055       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1056       Subtarget->isTargetWindows()) {
1057     setOperationAction(ISD::SREM, MVT::i64, Custom);
1058     setOperationAction(ISD::UREM, MVT::i64, Custom);
1059     HasStandaloneRem = false;
1060
1061     if (Subtarget->isTargetWindows()) {
1062       const struct {
1063         const RTLIB::Libcall Op;
1064         const char * const Name;
1065         const CallingConv::ID CC;
1066       } LibraryCalls[] = {
1067         { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1068         { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1069         { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1070         { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1071
1072         { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1073         { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1074         { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1075         { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1076       };
1077
1078       for (const auto &LC : LibraryCalls) {
1079         setLibcallName(LC.Op, LC.Name);
1080         setLibcallCallingConv(LC.Op, LC.CC);
1081       }
1082     } else {
1083       const struct {
1084         const RTLIB::Libcall Op;
1085         const char * const Name;
1086         const CallingConv::ID CC;
1087       } LibraryCalls[] = {
1088         { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1089         { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1090         { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1091         { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1092
1093         { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1094         { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1095         { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1096         { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1097       };
1098
1099       for (const auto &LC : LibraryCalls) {
1100         setLibcallName(LC.Op, LC.Name);
1101         setLibcallCallingConv(LC.Op, LC.CC);
1102       }
1103     }
1104
1105     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
1106     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1107     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
1108     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
1109   } else {
1110     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
1111     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
1112   }
1113
1114   if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
1115     for (auto &VT : {MVT::f32, MVT::f64})
1116       setOperationAction(ISD::FPOWI, VT, Custom);
1117
1118   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
1119   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
1120   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
1121   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
1122
1123   setOperationAction(ISD::TRAP, MVT::Other, Legal);
1124   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1125
1126   // Use the default implementation.
1127   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
1128   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
1129   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
1130   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
1131   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
1132   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
1133
1134   if (Subtarget->isTargetWindows())
1135     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1136   else
1137     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1138
1139   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1140   // the default expansion.
1141   InsertFencesForAtomic = false;
1142   if (Subtarget->hasAnyDataBarrier() &&
1143       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1144     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1145     // to ldrex/strex loops already.
1146     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
1147     if (!Subtarget->isThumb() || !Subtarget->isMClass())
1148       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
1149
1150     // On v8, we have particularly efficient implementations of atomic fences
1151     // if they can be combined with nearby atomic loads and stores.
1152     if (!Subtarget->hasAcquireRelease() ||
1153         getTargetMachine().getOptLevel() == 0) {
1154       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1155       InsertFencesForAtomic = true;
1156     }
1157   } else {
1158     // If there's anything we can use as a barrier, go through custom lowering
1159     // for ATOMIC_FENCE.
1160     // If target has DMB in thumb, Fences can be inserted.
1161     if (Subtarget->hasDataBarrier())
1162       InsertFencesForAtomic = true;
1163
1164     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
1165                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1166
1167     // Set them all for expansion, which will force libcalls.
1168     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
1169     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
1170     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
1171     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
1172     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
1173     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
1174     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
1175     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1176     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1177     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1178     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1179     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1180     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1181     // Unordered/Monotonic case.
1182     if (!InsertFencesForAtomic) {
1183       setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1184       setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1185     }
1186   }
1187
1188   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
1189
1190   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1191   if (!Subtarget->hasV6Ops()) {
1192     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1193     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
1194   }
1195   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1196
1197   if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1198       !Subtarget->isThumb1Only()) {
1199     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1200     // iff target supports vfp2.
1201     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1202     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1203   }
1204
1205   // We want to custom lower some of our intrinsics.
1206   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1207   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1208   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1209   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1210   if (Subtarget->useSjLjEH())
1211     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1212
1213   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
1214   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
1215   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
1216   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
1217   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
1218   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
1219   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1220   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1221   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1222   if (Subtarget->hasFullFP16()) {
1223     setOperationAction(ISD::SETCC,     MVT::f16, Expand);
1224     setOperationAction(ISD::SELECT,    MVT::f16, Custom);
1225     setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
1226   }
1227
1228   setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
1229
1230   setOperationAction(ISD::BRCOND,    MVT::Other, Custom);
1231   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
1232   if (Subtarget->hasFullFP16())
1233       setOperationAction(ISD::BR_CC, MVT::f16,   Custom);
1234   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
1235   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
1236   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
1237
1238   // We don't support sin/cos/fmod/copysign/pow
1239   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
1240   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
1241   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
1242   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
1243   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
1244   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
1245   setOperationAction(ISD::FREM,      MVT::f64, Expand);
1246   setOperationAction(ISD::FREM,      MVT::f32, Expand);
1247   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1248       !Subtarget->isThumb1Only()) {
1249     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1250     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1251   }
1252   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
1253   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
1254
1255   if (!Subtarget->hasVFP4Base()) {
1256     setOperationAction(ISD::FMA, MVT::f64, Expand);
1257     setOperationAction(ISD::FMA, MVT::f32, Expand);
1258   }
1259
1260   // Various VFP goodness
1261   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1262     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1263     if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1264       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1265       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1266     }
1267
1268     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1269     if (!Subtarget->hasFP16()) {
1270       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1271       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1272     }
1273   }
1274
1275   // Use __sincos_stret if available.
1276   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1277       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1278     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1279     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1280   }
1281
1282   // FP-ARMv8 implements a lot of rounding-like FP operations.
1283   if (Subtarget->hasFPARMv8Base()) {
1284     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1285     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1286     setOperationAction(ISD::FROUND, MVT::f32, Legal);
1287     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1288     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1289     setOperationAction(ISD::FRINT, MVT::f32, Legal);
1290     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1291     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1292     if (Subtarget->hasNEON()) {
1293       setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1294       setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1295       setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1296       setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1297     }
1298
1299     if (Subtarget->hasFP64()) {
1300       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1301       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1302       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1303       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1304       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1305       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1306       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1307       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1308     }
1309   }
1310
1311   // FP16 operations often need to be promoted to call library functions.
1312   if (Subtarget->hasFullFP16()) {
1313     setOperationAction(ISD::FREM, MVT::f16, Promote);
1314     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
1315     setOperationAction(ISD::FSIN, MVT::f16, Promote);
1316     setOperationAction(ISD::FCOS, MVT::f16, Promote);
1317     setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1318     setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1319     setOperationAction(ISD::FPOW, MVT::f16, Promote);
1320     setOperationAction(ISD::FEXP, MVT::f16, Promote);
1321     setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1322     setOperationAction(ISD::FLOG, MVT::f16, Promote);
1323     setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1324     setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1325
1326     setOperationAction(ISD::FROUND, MVT::f16, Legal);
1327   }
1328
1329   if (Subtarget->hasNEON()) {
1330     // vmin and vmax aren't available in a scalar form, so we use
1331     // a NEON instruction with an undef lane instead.
1332     setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1333     setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1334     setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1335     setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1336     setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1337     setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1338     setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1339     setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1340
1341     if (Subtarget->hasFullFP16()) {
1342       setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1343       setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1344       setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1345       setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1346
1347       setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1348       setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1349       setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1350       setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1351     }
1352   }
1353
1354   // We have target-specific dag combine patterns for the following nodes:
1355   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1356   setTargetDAGCombine(ISD::ADD);
1357   setTargetDAGCombine(ISD::SUB);
1358   setTargetDAGCombine(ISD::MUL);
1359   setTargetDAGCombine(ISD::AND);
1360   setTargetDAGCombine(ISD::OR);
1361   setTargetDAGCombine(ISD::XOR);
1362
1363   if (Subtarget->hasV6Ops())
1364     setTargetDAGCombine(ISD::SRL);
1365   if (Subtarget->isThumb1Only())
1366     setTargetDAGCombine(ISD::SHL);
1367
1368   setStackPointerRegisterToSaveRestore(ARM::SP);
1369
1370   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1371       !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1372     setSchedulingPreference(Sched::RegPressure);
1373   else
1374     setSchedulingPreference(Sched::Hybrid);
1375
1376   //// temporary - rewrite interface to use type
1377   MaxStoresPerMemset = 8;
1378   MaxStoresPerMemsetOptSize = 4;
1379   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1380   MaxStoresPerMemcpyOptSize = 2;
1381   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1382   MaxStoresPerMemmoveOptSize = 2;
1383
1384   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1385   // are at least 4 bytes aligned.
1386   setMinStackArgumentAlignment(4);
1387
1388   // Prefer likely predicted branches to selects on out-of-order cores.
1389   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1390
1391   setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
1392
1393   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
1394
1395   if (Subtarget->isThumb() || Subtarget->isThumb2())
1396     setTargetDAGCombine(ISD::ABS);
1397 }
1398
1399 bool ARMTargetLowering::useSoftFloat() const {
1400   return Subtarget->useSoftFloat();
1401 }
1402
1403 // FIXME: It might make sense to define the representative register class as the
1404 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1405 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1406 // SPR's representative would be DPR_VFP2. This should work well if register
1407 // pressure tracking were modified such that a register use would increment the
1408 // pressure of the register class's representative and all of it's super
1409 // classes' representatives transitively. We have not implemented this because
1410 // of the difficulty prior to coalescing of modeling operand register classes
1411 // due to the common occurrence of cross class copies and subregister insertions
1412 // and extractions.
1413 std::pair<const TargetRegisterClass *, uint8_t>
1414 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1415                                            MVT VT) const {
1416   const TargetRegisterClass *RRC = nullptr;
1417   uint8_t Cost = 1;
1418   switch (VT.SimpleTy) {
1419   default:
1420     return TargetLowering::findRepresentativeClass(TRI, VT);
1421   // Use DPR as representative register class for all floating point
1422   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1423   // the cost is 1 for both f32 and f64.
1424   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1425   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1426     RRC = &ARM::DPRRegClass;
1427     // When NEON is used for SP, only half of the register file is available
1428     // because operations that define both SP and DP results will be constrained
1429     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1430     // coalescing by double-counting the SP regs. See the FIXME above.
1431     if (Subtarget->useNEONForSinglePrecisionFP())
1432       Cost = 2;
1433     break;
1434   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1435   case MVT::v4f32: case MVT::v2f64:
1436     RRC = &ARM::DPRRegClass;
1437     Cost = 2;
1438     break;
1439   case MVT::v4i64:
1440     RRC = &ARM::DPRRegClass;
1441     Cost = 4;
1442     break;
1443   case MVT::v8i64:
1444     RRC = &ARM::DPRRegClass;
1445     Cost = 8;
1446     break;
1447   }
1448   return std::make_pair(RRC, Cost);
1449 }
1450
1451 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1452   switch ((ARMISD::NodeType)Opcode) {
1453   case ARMISD::FIRST_NUMBER:  break;
1454   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1455   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1456   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1457   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1458   case ARMISD::CALL:          return "ARMISD::CALL";
1459   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1460   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1461   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1462   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1463   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1464   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1465   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1466   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1467   case ARMISD::CMP:           return "ARMISD::CMP";
1468   case ARMISD::CMN:           return "ARMISD::CMN";
1469   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1470   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1471   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1472   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1473   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1474
1475   case ARMISD::CMOV:          return "ARMISD::CMOV";
1476   case ARMISD::SUBS:          return "ARMISD::SUBS";
1477
1478   case ARMISD::SSAT:          return "ARMISD::SSAT";
1479   case ARMISD::USAT:          return "ARMISD::USAT";
1480
1481   case ARMISD::ASRL:          return "ARMISD::ASRL";
1482   case ARMISD::LSRL:          return "ARMISD::LSRL";
1483   case ARMISD::LSLL:          return "ARMISD::LSLL";
1484
1485   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1486   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1487   case ARMISD::RRX:           return "ARMISD::RRX";
1488
1489   case ARMISD::ADDC:          return "ARMISD::ADDC";
1490   case ARMISD::ADDE:          return "ARMISD::ADDE";
1491   case ARMISD::SUBC:          return "ARMISD::SUBC";
1492   case ARMISD::SUBE:          return "ARMISD::SUBE";
1493   case ARMISD::LSLS:          return "ARMISD::LSLS";
1494
1495   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1496   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1497   case ARMISD::VMOVhr:        return "ARMISD::VMOVhr";
1498   case ARMISD::VMOVrh:        return "ARMISD::VMOVrh";
1499   case ARMISD::VMOVSR:        return "ARMISD::VMOVSR";
1500
1501   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1502   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1503   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1504
1505   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1506
1507   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1508
1509   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1510
1511   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1512
1513   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1514
1515   case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
1516   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1517
1518   case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
1519   case ARMISD::VCMP:          return "ARMISD::VCMP";
1520   case ARMISD::VCMPZ:         return "ARMISD::VCMPZ";
1521   case ARMISD::VTST:          return "ARMISD::VTST";
1522
1523   case ARMISD::VSHLs:         return "ARMISD::VSHLs";
1524   case ARMISD::VSHLu:         return "ARMISD::VSHLu";
1525   case ARMISD::VSHLIMM:       return "ARMISD::VSHLIMM";
1526   case ARMISD::VSHRsIMM:      return "ARMISD::VSHRsIMM";
1527   case ARMISD::VSHRuIMM:      return "ARMISD::VSHRuIMM";
1528   case ARMISD::VRSHRsIMM:     return "ARMISD::VRSHRsIMM";
1529   case ARMISD::VRSHRuIMM:     return "ARMISD::VRSHRuIMM";
1530   case ARMISD::VRSHRNIMM:     return "ARMISD::VRSHRNIMM";
1531   case ARMISD::VQSHLsIMM:     return "ARMISD::VQSHLsIMM";
1532   case ARMISD::VQSHLuIMM:     return "ARMISD::VQSHLuIMM";
1533   case ARMISD::VQSHLsuIMM:    return "ARMISD::VQSHLsuIMM";
1534   case ARMISD::VQSHRNsIMM:    return "ARMISD::VQSHRNsIMM";
1535   case ARMISD::VQSHRNuIMM:    return "ARMISD::VQSHRNuIMM";
1536   case ARMISD::VQSHRNsuIMM:   return "ARMISD::VQSHRNsuIMM";
1537   case ARMISD::VQRSHRNsIMM:   return "ARMISD::VQRSHRNsIMM";
1538   case ARMISD::VQRSHRNuIMM:   return "ARMISD::VQRSHRNuIMM";
1539   case ARMISD::VQRSHRNsuIMM:  return "ARMISD::VQRSHRNsuIMM";
1540   case ARMISD::VSLIIMM:       return "ARMISD::VSLIIMM";
1541   case ARMISD::VSRIIMM:       return "ARMISD::VSRIIMM";
1542   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1543   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1544   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1545   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1546   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1547   case ARMISD::VDUP:          return "ARMISD::VDUP";
1548   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1549   case ARMISD::VEXT:          return "ARMISD::VEXT";
1550   case ARMISD::VREV64:        return "ARMISD::VREV64";
1551   case ARMISD::VREV32:        return "ARMISD::VREV32";
1552   case ARMISD::VREV16:        return "ARMISD::VREV16";
1553   case ARMISD::VZIP:          return "ARMISD::VZIP";
1554   case ARMISD::VUZP:          return "ARMISD::VUZP";
1555   case ARMISD::VTRN:          return "ARMISD::VTRN";
1556   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1557   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1558   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1559   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1560   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
1561   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1562   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1563   case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
1564   case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
1565   case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
1566   case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
1567   case ARMISD::SMULWB:        return "ARMISD::SMULWB";
1568   case ARMISD::SMULWT:        return "ARMISD::SMULWT";
1569   case ARMISD::SMLALD:        return "ARMISD::SMLALD";
1570   case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
1571   case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
1572   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
1573   case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
1574   case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
1575   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1576   case ARMISD::BFI:           return "ARMISD::BFI";
1577   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1578   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1579   case ARMISD::VBSL:          return "ARMISD::VBSL";
1580   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1581   case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
1582   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1583   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1584   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1585   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1586   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1587   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1588   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1589   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1590   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1591   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1592   case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
1593   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1594   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1595   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1596   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1597   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1598   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1599   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1600   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1601   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1602   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1603   case ARMISD::WLS:           return "ARMISD::WLS";
1604   case ARMISD::LE:            return "ARMISD::LE";
1605   case ARMISD::LOOP_DEC:      return "ARMISD::LOOP_DEC";
1606   }
1607   return nullptr;
1608 }
1609
1610 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1611                                           EVT VT) const {
1612   if (!VT.isVector())
1613     return getPointerTy(DL);
1614
1615   // MVE has a predicate register.
1616   if (Subtarget->hasMVEIntegerOps() &&
1617       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
1618     return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1619   return VT.changeVectorElementTypeToInteger();
1620 }
1621
1622 /// getRegClassFor - Return the register class that should be used for the
1623 /// specified value type.
1624 const TargetRegisterClass *
1625 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1626   (void)isDivergent;
1627   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1628   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1629   // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1630   // MVE Q registers.
1631   if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1632     if (VT == MVT::v4i64)
1633       return &ARM::QQPRRegClass;
1634     if (VT == MVT::v8i64)
1635       return &ARM::QQQQPRRegClass;
1636   }
1637   return TargetLowering::getRegClassFor(VT);
1638 }
1639
1640 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1641 // source/dest is aligned and the copy size is large enough. We therefore want
1642 // to align such objects passed to memory intrinsics.
1643 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1644                                                unsigned &PrefAlign) const {
1645   if (!isa<MemIntrinsic>(CI))
1646     return false;
1647   MinSize = 8;
1648   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1649   // cycle faster than 4-byte aligned LDM.
1650   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1651   return true;
1652 }
1653
1654 // Create a fast isel object.
1655 FastISel *
1656 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1657                                   const TargetLibraryInfo *libInfo) const {
1658   return ARM::createFastISel(funcInfo, libInfo);
1659 }
1660
1661 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1662   unsigned NumVals = N->getNumValues();
1663   if (!NumVals)
1664     return Sched::RegPressure;
1665
1666   for (unsigned i = 0; i != NumVals; ++i) {
1667     EVT VT = N->getValueType(i);
1668     if (VT == MVT::Glue || VT == MVT::Other)
1669       continue;
1670     if (VT.isFloatingPoint() || VT.isVector())
1671       return Sched::ILP;
1672   }
1673
1674   if (!N->isMachineOpcode())
1675     return Sched::RegPressure;
1676
1677   // Load are scheduled for latency even if there instruction itinerary
1678   // is not available.
1679   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1680   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1681
1682   if (MCID.getNumDefs() == 0)
1683     return Sched::RegPressure;
1684   if (!Itins->isEmpty() &&
1685       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1686     return Sched::ILP;
1687
1688   return Sched::RegPressure;
1689 }
1690
1691 //===----------------------------------------------------------------------===//
1692 // Lowering Code
1693 //===----------------------------------------------------------------------===//
1694
1695 static bool isSRL16(const SDValue &Op) {
1696   if (Op.getOpcode() != ISD::SRL)
1697     return false;
1698   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1699     return Const->getZExtValue() == 16;
1700   return false;
1701 }
1702
1703 static bool isSRA16(const SDValue &Op) {
1704   if (Op.getOpcode() != ISD::SRA)
1705     return false;
1706   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1707     return Const->getZExtValue() == 16;
1708   return false;
1709 }
1710
1711 static bool isSHL16(const SDValue &Op) {
1712   if (Op.getOpcode() != ISD::SHL)
1713     return false;
1714   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1715     return Const->getZExtValue() == 16;
1716   return false;
1717 }
1718
1719 // Check for a signed 16-bit value. We special case SRA because it makes it
1720 // more simple when also looking for SRAs that aren't sign extending a
1721 // smaller value. Without the check, we'd need to take extra care with
1722 // checking order for some operations.
1723 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1724   if (isSRA16(Op))
1725     return isSHL16(Op.getOperand(0));
1726   return DAG.ComputeNumSignBits(Op) == 17;
1727 }
1728
1729 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1730 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1731   switch (CC) {
1732   default: llvm_unreachable("Unknown condition code!");
1733   case ISD::SETNE:  return ARMCC::NE;
1734   case ISD::SETEQ:  return ARMCC::EQ;
1735   case ISD::SETGT:  return ARMCC::GT;
1736   case ISD::SETGE:  return ARMCC::GE;
1737   case ISD::SETLT:  return ARMCC::LT;
1738   case ISD::SETLE:  return ARMCC::LE;
1739   case ISD::SETUGT: return ARMCC::HI;
1740   case ISD::SETUGE: return ARMCC::HS;
1741   case ISD::SETULT: return ARMCC::LO;
1742   case ISD::SETULE: return ARMCC::LS;
1743   }
1744 }
1745
1746 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1747 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1748                         ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
1749   CondCode2 = ARMCC::AL;
1750   InvalidOnQNaN = true;
1751   switch (CC) {
1752   default: llvm_unreachable("Unknown FP condition!");
1753   case ISD::SETEQ:
1754   case ISD::SETOEQ:
1755     CondCode = ARMCC::EQ;
1756     InvalidOnQNaN = false;
1757     break;
1758   case ISD::SETGT:
1759   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1760   case ISD::SETGE:
1761   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1762   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1763   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1764   case ISD::SETONE:
1765     CondCode = ARMCC::MI;
1766     CondCode2 = ARMCC::GT;
1767     InvalidOnQNaN = false;
1768     break;
1769   case ISD::SETO:   CondCode = ARMCC::VC; break;
1770   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1771   case ISD::SETUEQ:
1772     CondCode = ARMCC::EQ;
1773     CondCode2 = ARMCC::VS;
1774     InvalidOnQNaN = false;
1775     break;
1776   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1777   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1778   case ISD::SETLT:
1779   case ISD::SETULT: CondCode = ARMCC::LT; break;
1780   case ISD::SETLE:
1781   case ISD::SETULE: CondCode = ARMCC::LE; break;
1782   case ISD::SETNE:
1783   case ISD::SETUNE:
1784     CondCode = ARMCC::NE;
1785     InvalidOnQNaN = false;
1786     break;
1787   }
1788 }
1789
1790 //===----------------------------------------------------------------------===//
1791 //                      Calling Convention Implementation
1792 //===----------------------------------------------------------------------===//
1793
1794 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1795 /// account presence of floating point hardware and calling convention
1796 /// limitations, such as support for variadic functions.
1797 CallingConv::ID
1798 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1799                                            bool isVarArg) const {
1800   switch (CC) {
1801   default:
1802     report_fatal_error("Unsupported calling convention");
1803   case CallingConv::ARM_AAPCS:
1804   case CallingConv::ARM_APCS:
1805   case CallingConv::GHC:
1806     return CC;
1807   case CallingConv::PreserveMost:
1808     return CallingConv::PreserveMost;
1809   case CallingConv::ARM_AAPCS_VFP:
1810   case CallingConv::Swift:
1811     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1812   case CallingConv::C:
1813     if (!Subtarget->isAAPCS_ABI())
1814       return CallingConv::ARM_APCS;
1815     else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1816              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1817              !isVarArg)
1818       return CallingConv::ARM_AAPCS_VFP;
1819     else
1820       return CallingConv::ARM_AAPCS;
1821   case CallingConv::Fast:
1822   case CallingConv::CXX_FAST_TLS:
1823     if (!Subtarget->isAAPCS_ABI()) {
1824       if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1825         return CallingConv::Fast;
1826       return CallingConv::ARM_APCS;
1827     } else if (Subtarget->hasVFP2Base() &&
1828                !Subtarget->isThumb1Only() && !isVarArg)
1829       return CallingConv::ARM_AAPCS_VFP;
1830     else
1831       return CallingConv::ARM_AAPCS;
1832   }
1833 }
1834
1835 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1836                                                  bool isVarArg) const {
1837   return CCAssignFnForNode(CC, false, isVarArg);
1838 }
1839
1840 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1841                                                    bool isVarArg) const {
1842   return CCAssignFnForNode(CC, true, isVarArg);
1843 }
1844
1845 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1846 /// CallingConvention.
1847 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1848                                                  bool Return,
1849                                                  bool isVarArg) const {
1850   switch (getEffectiveCallingConv(CC, isVarArg)) {
1851   default:
1852     report_fatal_error("Unsupported calling convention");
1853   case CallingConv::ARM_APCS:
1854     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1855   case CallingConv::ARM_AAPCS:
1856     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1857   case CallingConv::ARM_AAPCS_VFP:
1858     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1859   case CallingConv::Fast:
1860     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1861   case CallingConv::GHC:
1862     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1863   case CallingConv::PreserveMost:
1864     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1865   }
1866 }
1867
1868 /// LowerCallResult - Lower the result values of a call into the
1869 /// appropriate copies out of appropriate physical registers.
1870 SDValue ARMTargetLowering::LowerCallResult(
1871     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
1872     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1873     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1874     SDValue ThisVal) const {
1875   // Assign locations to each value returned by this call.
1876   SmallVector<CCValAssign, 16> RVLocs;
1877   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1878                  *DAG.getContext());
1879   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1880
1881   // Copy all of the result registers out of their specified physreg.
1882   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1883     CCValAssign VA = RVLocs[i];
1884
1885     // Pass 'this' value directly from the argument to return value, to avoid
1886     // reg unit interference
1887     if (i == 0 && isThisReturn) {
1888       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1889              "unexpected return calling convention register assignment");
1890       InVals.push_back(ThisVal);
1891       continue;
1892     }
1893
1894     SDValue Val;
1895     if (VA.needsCustom()) {
1896       // Handle f64 or half of a v2f64.
1897       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1898                                       InFlag);
1899       Chain = Lo.getValue(1);
1900       InFlag = Lo.getValue(2);
1901       VA = RVLocs[++i]; // skip ahead to next loc
1902       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1903                                       InFlag);
1904       Chain = Hi.getValue(1);
1905       InFlag = Hi.getValue(2);
1906       if (!Subtarget->isLittle())
1907         std::swap (Lo, Hi);
1908       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1909
1910       if (VA.getLocVT() == MVT::v2f64) {
1911         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1912         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1913                           DAG.getConstant(0, dl, MVT::i32));
1914
1915         VA = RVLocs[++i]; // skip ahead to next loc
1916         Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1917         Chain = Lo.getValue(1);
1918         InFlag = Lo.getValue(2);
1919         VA = RVLocs[++i]; // skip ahead to next loc
1920         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1921         Chain = Hi.getValue(1);
1922         InFlag = Hi.getValue(2);
1923         if (!Subtarget->isLittle())
1924           std::swap (Lo, Hi);
1925         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1926         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1927                           DAG.getConstant(1, dl, MVT::i32));
1928       }
1929     } else {
1930       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1931                                InFlag);
1932       Chain = Val.getValue(1);
1933       InFlag = Val.getValue(2);
1934     }
1935
1936     switch (VA.getLocInfo()) {
1937     default: llvm_unreachable("Unknown loc info!");
1938     case CCValAssign::Full: break;
1939     case CCValAssign::BCvt:
1940       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1941       break;
1942     }
1943
1944     InVals.push_back(Val);
1945   }
1946
1947   return Chain;
1948 }
1949
1950 /// LowerMemOpCallTo - Store the argument to the stack.
1951 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1952                                             SDValue Arg, const SDLoc &dl,
1953                                             SelectionDAG &DAG,
1954                                             const CCValAssign &VA,
1955                                             ISD::ArgFlagsTy Flags) const {
1956   unsigned LocMemOffset = VA.getLocMemOffset();
1957   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1958   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1959                        StackPtr, PtrOff);
1960   return DAG.getStore(
1961       Chain, dl, Arg, PtrOff,
1962       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
1963 }
1964
1965 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1966                                          SDValue Chain, SDValue &Arg,
1967                                          RegsToPassVector &RegsToPass,
1968                                          CCValAssign &VA, CCValAssign &NextVA,
1969                                          SDValue &StackPtr,
1970                                          SmallVectorImpl<SDValue> &MemOpChains,
1971                                          ISD::ArgFlagsTy Flags) const {
1972   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1973                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
1974   unsigned id = Subtarget->isLittle() ? 0 : 1;
1975   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1976
1977   if (NextVA.isRegLoc())
1978     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1979   else {
1980     assert(NextVA.isMemLoc());
1981     if (!StackPtr.getNode())
1982       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1983                                     getPointerTy(DAG.getDataLayout()));
1984
1985     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
1986                                            dl, DAG, NextVA,
1987                                            Flags));
1988   }
1989 }
1990
1991 /// LowerCall - Lowering a call into a callseq_start <-
1992 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
1993 /// nodes.
     ///
     /// \param CLI    Description of the call being lowered (DAG, debug location,
     ///               outgoing/incoming argument lists, chain, callee, calling
     ///               convention and flags). CLI.IsTailCall is bound by reference
     ///               and is cleared here whenever the call cannot be emitted as
     ///               a tail call.
     /// \param InVals Forwarded unchanged to LowerCallResult for non-tail calls.
     /// \returns the ARMISD::TC_RETURN node when the call is emitted as a tail
     ///          call; otherwise the value returned by LowerCallResult.
     ///
     /// Calls report_fatal_error if a call site marked musttail fails the
     /// tail-call eligibility check.
1994 SDValue
1995 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1996                              SmallVectorImpl<SDValue> &InVals) const {
1997   SelectionDAG &DAG                     = CLI.DAG;
1998   SDLoc &dl                             = CLI.DL;
1999   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2000   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2001   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2002   SDValue Chain                         = CLI.Chain;
2003   SDValue Callee                        = CLI.Callee;
       // Bound by reference: every assignment to isTailCall below is visible to
       // the caller through CLI.IsTailCall.
2004   bool &isTailCall                      = CLI.IsTailCall;
2005   CallingConv::ID CallConv              = CLI.CallConv;
2006   bool doesNotRet                       = CLI.DoesNotReturn;
2007   bool isVarArg                         = CLI.IsVarArg;
2008
2009   MachineFunction &MF = DAG.getMachineFunction();
       // sret is only looked for on the first outgoing argument.
2010   bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2011   bool isThisReturn = false;
2012   auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
2013   bool PreferIndirect = false;
2014
2015   // Disable tail calls if they're not supported.
2016   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
2017     isTailCall = false;
2018
2019   if (isa<GlobalAddressSDNode>(Callee)) {
2020     // If we're optimizing for minimum size and the function is called three or
2021     // more times in this block, we can improve codesize by calling indirectly
2022     // as BLXr has a 16-bit encoding.
2023     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2024     if (CLI.CS) {
2025       auto *BB = CLI.CS.getParent();
           // Count the users of GV that are instructions in the call site's own
           // basic block; more than two of them selects the indirect form.
2026       PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2027                        count_if(GV->users(), [&BB](const User *U) {
2028                          return isa<Instruction>(U) &&
2029                                 cast<Instruction>(U)->getParent() == BB;
2030                        }) > 2;
2031     }
2032   }
2033   if (isTailCall) {
2034     // Check if it's really possible to do a tail call.
2035     isTailCall = IsEligibleForTailCallOptimization(
2036         Callee, CallConv, isVarArg, isStructRet,
2037         MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2038         PreferIndirect);
         // A musttail call site that turned out not to be eligible is a hard
         // error rather than a silent fallback to a normal call.
2039     if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
2040       report_fatal_error("failed to perform tail call elimination on a call "
2041                          "site marked musttail");
2042     // We don't support GuaranteedTailCallOpt for ARM, only automatically
2043     // detected sibcalls.
2044     if (isTailCall)
2045       ++NumTailCalls;
2046   }
2047
2048   // Analyze operands of the call, assigning locations to each operand.
2049   SmallVector<CCValAssign, 16> ArgLocs;
2050   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2051                  *DAG.getContext());
2052   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2053
2054   // Get a count of how many bytes are to be pushed on the stack.
2055   unsigned NumBytes = CCInfo.getNextStackOffset();
2056
2057   if (isTailCall) {
2058     // For tail calls, memory operands are available in our caller's stack.
2059     NumBytes = 0;
2060   } else {
2061     // Adjust the stack pointer for the new arguments...
2062     // These operations are automatically eliminated by the prolog/epilog pass
2063     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
2064   }
2065
       // SP as seen after the (optional) CALLSEQ_START; used as the base address
       // for every outgoing stack argument below.
2066   SDValue StackPtr =
2067       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2068
       // Register arguments are collected here and copied into their physical
       // registers after the loop; MemOpChains collects the chains of all memory
       // operations emitted for arguments.
2069   RegsToPassVector RegsToPass;
2070   SmallVector<SDValue, 8> MemOpChains;
2071
2072   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2073   // of tail call optimization, arguments are handled later.
       // i indexes ArgLocs while realArgIdx indexes Outs/OutVals. The custom
       // f64/v2f64 paths below advance i past the extra ArgLocs entries they
       // consume, so the two indices can diverge.
2074   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2075        i != e;
2076        ++i, ++realArgIdx) {
2077     CCValAssign &VA = ArgLocs[i];
2078     SDValue Arg = OutVals[realArgIdx];
2079     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2080     bool isByVal = Flags.isByVal();
2081
2082     // Promote the value if needed.
2083     switch (VA.getLocInfo()) {
2084     default: llvm_unreachable("Unknown loc info!");
2085     case CCValAssign::Full: break;
2086     case CCValAssign::SExt:
2087       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2088       break;
2089     case CCValAssign::ZExt:
2090       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2091       break;
2092     case CCValAssign::AExt:
2093       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2094       break;
2095     case CCValAssign::BCvt:
2096       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2097       break;
2098     }
2099
2100     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2101     if (VA.needsCustom()) {
2102       if (VA.getLocVT() == MVT::v2f64) {
             // Split the vector into its two f64 lanes; each lane is then passed
             // like a scalar f64.
2103         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2104                                   DAG.getConstant(0, dl, MVT::i32));
2105         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2106                                   DAG.getConstant(1, dl, MVT::i32));
2107
2108         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
2109                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
2110
             // NOTE(review): VA is a reference bound to the ArgLocs entry this
             // iteration started on, so the assignment below copies the later
             // entry into that slot; it does not rebind VA.
2111         VA = ArgLocs[++i]; // skip ahead to next loc
2112         if (VA.isRegLoc()) {
2113           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
2114                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
2115         } else {
2116           assert(VA.isMemLoc());
2117
2118           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
2119                                                  dl, DAG, VA, Flags));
2120         }
2121       } else {
2122         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2123                          StackPtr, MemOpChains, Flags);
2124       }
2125     } else if (VA.isRegLoc()) {
           // A first argument of type i32 marked 'returned' (and not swiftself)
           // is remembered so that the register mask selection and
           // LowerCallResult below can treat this call as a 'this' return.
2126       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2127           Outs[0].VT == MVT::i32) {
2128         assert(VA.getLocVT() == MVT::i32 &&
2129                "unexpected calling convention register assignment");
2130         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2131                "unexpected use of 'returned'");
2132         isThisReturn = true;
2133       }
2134       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2135     } else if (isByVal) {
2136       assert(VA.isMemLoc());
           // Number of registers (4-byte words) of the aggregate that are passed
           // in registers; stays 0 when nothing is passed in registers.
2137       unsigned offset = 0;
2138
2139       // True if this byval aggregate will be split between registers
2140       // and memory.
2141       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2142       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2143
2144       if (CurByValIdx < ByValArgsCount) {
2145
2146         unsigned RegBegin, RegEnd;
2147         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2148
2149         EVT PtrVT =
2150             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
             // NOTE: this i shadows the ArgLocs loop index. Here i is the word
             // index into the aggregate and j is the destination register.
2151         unsigned int i, j;
2152         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2153           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2154           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2155           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
2156                                      MachinePointerInfo(),
2157                                      DAG.InferPtrAlignment(AddArg));
2158           MemOpChains.push_back(Load.getValue(1));
2159           RegsToPass.push_back(std::make_pair(j, Load));
2160         }
2161
2162         // If the parameter size exceeds the register area, the "offset" value
2163         // helps us to calculate the stack slot for the remaining part properly.
2164         offset = RegEnd - RegBegin;
2165
2166         CCInfo.nextInRegsParam();
2167       }
2168
           // Whatever did not go into registers (everything, if offset is 0) is
           // copied to the outgoing stack area with a COPY_STRUCT_BYVAL node.
2169       if (Flags.getByValSize() > 4*offset) {
2170         auto PtrVT = getPointerTy(DAG.getDataLayout());
2171         unsigned LocMemOffset = VA.getLocMemOffset();
2172         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2173         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
2174         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2175         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2176         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2177                                            MVT::i32);
2178         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
2179                                             MVT::i32);
2180
2181         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2182         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2183         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2184                                           Ops));
2185       }
2186     } else if (!isTailCall) {
           // Ordinary stack argument. Nothing is emitted for these when the call
           // is a tail call.
2187       assert(VA.isMemLoc());
2188
2189       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2190                                              dl, DAG, VA, Flags));
2191     }
2192   }
2193
       // Join the chains of all argument memory operations into a single chain.
2194   if (!MemOpChains.empty())
2195     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2196
2197   // Build a sequence of copy-to-reg nodes chained together with token chain
2198   // and flag operands which copy the outgoing args into the appropriate regs.
2199   SDValue InFlag;
2200   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2201     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2202                              RegsToPass[i].second, InFlag);
2203     InFlag = Chain.getValue(1);
2204   }
2205
2206   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2207   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2208   // node so that legalize doesn't hack it.
2209   bool isDirect = false;
2210
2211   const TargetMachine &TM = getTargetMachine();
2212   const Module *Mod = MF.getFunction().getParent();
       // GV stays null when the callee is not a GlobalAddress node.
2213   const GlobalValue *GV = nullptr;
2214   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2215     GV = G->getGlobal();
2216   bool isStub =
2217       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2218
2219   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2220   bool isLocalARMFunc = false;
2221   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2222   auto PtrVt = getPointerTy(DAG.getDataLayout());
2223
2224   if (Subtarget->genLongCalls()) {
2225     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2226            "long-calls codegen is not position independent!");
2227     // Handle a global address or an external symbol. If it's not one of
2228     // those, the target's already in a register, so we don't need to do
2229     // anything extra.
2230     if (isa<GlobalAddressSDNode>(Callee)) {
2231       // Create a constant pool entry for the callee address
2232       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2233       ARMConstantPoolValue *CPV =
2234         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2235
2236       // Get the address of the callee into a register
2237       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2238       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2239       Callee = DAG.getLoad(
2240           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2241           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2242     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2243       const char *Sym = S->getSymbol();
2244
2245       // Create a constant pool entry for the callee address
2246       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2247       ARMConstantPoolValue *CPV =
2248         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2249                                       ARMPCLabelIndex, 0);
2250       // Get the address of the callee into a register
2251       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2252       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2253       Callee = DAG.getLoad(
2254           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2255           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2256     }
2257   } else if (isa<GlobalAddressSDNode>(Callee)) {
         // When PreferIndirect is set the callee is left as it is and isDirect
         // stays false.
2258     if (!PreferIndirect) {
2259       isDirect = true;
2260       bool isDef = GV->isStrongDefinitionForLinker();
2261
2262       // ARM call to a local ARM function is predicable.
2263       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2264       // tBX takes a register source operand.
2265       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2266         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2267         Callee = DAG.getNode(
2268             ARMISD::WrapperPIC, dl, PtrVt,
2269             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2270         Callee = DAG.getLoad(
2271             PtrVt, dl, DAG.getEntryNode(), Callee,
2272             MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2273             /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
2274                                      MachineMemOperand::MOInvariant);
2275       } else if (Subtarget->isTargetCOFF()) {
2276         assert(Subtarget->isTargetWindows() &&
2277                "Windows is the only supported COFF target");
2278         unsigned TargetFlags = GV->hasDLLImportStorageClass()
2279                                    ? ARMII::MO_DLLIMPORT
2280                                    : ARMII::MO_NO_FLAG;
2281         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
2282                                             TargetFlags);
             // dllimport callees are called through a pointer that is loaded
             // here.
2283         if (GV->hasDLLImportStorageClass())
2284           Callee =
2285               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2286                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2287                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2288       } else {
2289         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2290       }
2291     }
2292   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2293     isDirect = true;
2294     // tBX takes a register source operand.
2295     const char *Sym = S->getSymbol();
2296     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2297       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2298       ARMConstantPoolValue *CPV =
2299         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2300                                       ARMPCLabelIndex, 4);
2301       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2302       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2303       Callee = DAG.getLoad(
2304           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2305           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2306       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2307       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2308     } else {
2309       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2310     }
2311   }
2312
2313   // FIXME: handle tail calls differently.
       // Pick the call opcode. It is only used on the non-tail-call path; the
       // tail-call path below always emits ARMISD::TC_RETURN.
2314   unsigned CallOpc;
2315   if (Subtarget->isThumb()) {
2316     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2317       CallOpc = ARMISD::CALL_NOLINK;
2318     else
2319       CallOpc = ARMISD::CALL;
2320   } else {
2321     if (!isDirect && !Subtarget->hasV5TOps())
2322       CallOpc = ARMISD::CALL_NOLINK;
2323     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2324              // Emit regular call when code size is the priority
2325              !Subtarget->hasMinSize())
2326       // "mov lr, pc; b _foo" to avoid confusing the RSP
2327       CallOpc = ARMISD::CALL_NOLINK;
2328     else
2329       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2330   }
2331
       // Operand list of the call node: chain, callee, argument registers, then
       // (non-tail calls only) the register mask, then the glue if there is one.
2332   std::vector<SDValue> Ops;
2333   Ops.push_back(Chain);
2334   Ops.push_back(Callee);
2335
2336   // Add argument registers to the end of the list so that they are known live
2337   // into the call.
2338   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2339     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2340                                   RegsToPass[i].second.getValueType()));
2341
2342   // Add a register mask operand representing the call-preserved registers.
2343   if (!isTailCall) {
2344     const uint32_t *Mask;
2345     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2346     if (isThisReturn) {
2347       // For 'this' returns, use the R0-preserving mask if applicable
2348       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2349       if (!Mask) {
2350         // Set isThisReturn to false if the calling convention is not one that
2351         // allows 'returned' to be modeled in this way, so LowerCallResult does
2352         // not try to pass 'this' straight through
2353         isThisReturn = false;
2354         Mask = ARI->getCallPreservedMask(MF, CallConv);
2355       }
2356     } else
2357       Mask = ARI->getCallPreservedMask(MF, CallConv);
2358
2359     assert(Mask && "Missing call preserved mask for calling convention");
2360     Ops.push_back(DAG.getRegisterMask(Mask));
2361   }
2362
2363   if (InFlag.getNode())
2364     Ops.push_back(InFlag);
2365
2366   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
       // Tail call: record it on the frame info and emit TC_RETURN. No
       // CALLSEQ_END and no result copies are generated on this path.
2367   if (isTailCall) {
2368     MF.getFrameInfo().setHasTailCall();
2369     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2370   }
2371
2372   // Returns a chain and a flag for retval copy to use.
2373   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2374   InFlag = Chain.getValue(1);
2375
       // Close the call sequence opened by CALLSEQ_START above with the same
       // byte count.
2376   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2377                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
       // The glue of CALLSEQ_END is only picked up when there are results to
       // copy out.
2378   if (!Ins.empty())
2379     InFlag = Chain.getValue(1);
2380
2381   // Handle result values, copying them out of physregs into vregs that we
2382   // return.
2383   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2384                          InVals, isThisReturn,
2385                          isThisReturn ? OutVals[0] : SDValue());
2386 }
2387
2388 /// HandleByVal - Every parameter *after* a byval parameter is passed
2389 /// on the stack.  Remember the next parameter register to allocate,
2390 /// and then confiscate the rest of the parameter registers to ensure
2391 /// this.
     ///
     /// \param State Calling-convention state; argument registers are allocated
     ///              from GPRArgRegs and the register range used for this byval
     ///              parameter is recorded via addInRegsParamInfo.
     /// \param Size  In/out. On entry the size of the byval parameter; on the
     ///              path that records a register range it is reduced by the
     ///              bytes that fit into the registers up to r4 (clamped at 0).
     ///              On every early-return path it is left unchanged.
     /// \param Align Requested alignment of the parameter; raised to at least 4.
2392 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2393                                     unsigned Align) const {
2394   // Byval (as with any stack) slots are always at least 4 byte aligned.
2395   Align = std::max(Align, 4U);
2396
       // Grab the next free argument register; 0 means none is left, in which
       // case nothing is recorded and Size is not touched.
2397   unsigned Reg = State->AllocateReg(GPRArgRegs);
2398   if (!Reg)
2399     return;
2400
       // Skip registers until the number of registers remaining before r4 is a
       // multiple of the alignment expressed in registers.
       // NOTE(review): relies on the argument register enum values up to
       // ARM::R4 being consecutive - confirm in the register definitions.
2401   unsigned AlignInRegs = Align / 4;
2402   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2403   for (unsigned i = 0; i < Waste; ++i)
2404     Reg = State->AllocateReg(GPRArgRegs);
2405
2406   if (!Reg)
2407     return;
2408
       // Number of bytes that fit into the registers [Reg, r4).
2409   unsigned Excess = 4 * (ARM::R4 - Reg);
2410
2411   // Special case when NSAA != SP and parameter size greater than size of
2412   // all remaining GPR regs. In that case we can't split parameter, we must
2413   // send it to stack. We also must set NCRN to R4, so waste all
2414   // remaining registers.
2415   const unsigned NSAAOffset = State->getNextStackOffset();
2416   if (NSAAOffset != 0 && Size > Excess) {
2417     while (State->AllocateReg(GPRArgRegs))
2418       ;
2419     return;
2420   }
2421
2422   // First register for byval parameter is the first register that wasn't
2423   // allocated before this method call, so it would be "reg".
2424   // If parameter is small enough to be saved in range [reg, r4), then
2425   // the end (first after last) register would be reg + param-size-in-regs,
2426   // else parameter would be split between registers and stack,
2427   // end register would be r4 in this case.
2428   unsigned ByValRegBegin = Reg;
2429   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2430   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2431   // Note, first register is allocated in the beginning of function already,
2432   // allocate remaining amount of registers we need.
2433   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2434     State->AllocateReg(GPRArgRegs);
2435   // A byval parameter that is split between registers and memory needs its
2436   // size truncated here.
2437   // In the case where the entire structure fits in registers, we set the
2438   // size in memory to zero.
       // The unsigned difference is converted to int by std::max<int>, so a
       // result that would be negative is clamped to 0 instead of wrapping.
2439   Size = std::max<int>(Size - Excess, 0);
2440 }
2441
2442 /// MatchingStackOffset - Decide whether an outgoing stack argument already
2443 /// sits at the same (relative) position in the caller's incoming argument
2444 /// area, i.e. at \p Offset in a fixed frame object of the argument's size.
2445 static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
2446                                 ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
2447                                 const MachineRegisterInfo *MRI,
2448                                 const TargetInstrInfo *TII) {
2449   const int NoFrameIdx = std::numeric_limits<int>::max();
2450   const unsigned ArgBytes = Arg.getValueSizeInBits() / 8;
2451   int FrameIdx = NoFrameIdx;
2452
2453   if (Arg.getOpcode() == ISD::CopyFromReg) {
2454     // The argument is read from a register: it qualifies only when that is
2455     // a virtual register defined by a (non-byval) load from a stack slot.
2456     const unsigned VReg = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2457     if (!Register::isVirtualRegister(VReg))
2458       return false;
2459     MachineInstr *DefMI = MRI->getVRegDef(VReg);
2460     if (!DefMI || Flags.isByVal())
2461       return false;
2462     if (!TII->isLoadFromStackSlot(*DefMI, FrameIdx))
2463       return false;
2464   } else if (auto *Load = dyn_cast<LoadSDNode>(Arg)) {
2465     // A byval argument is passed in as a pointer, but here it is being
2466     // dereferenced, e.g.
2467     //   define @foo(%struct.X* %A) {
2468     //     tail call @bar(%struct.X* byval %A)
2469     //   }
2470     if (Flags.isByVal())
2471       return false;
2472     auto *FrameNode = dyn_cast<FrameIndexSDNode>(Load->getBasePtr());
2473     if (!FrameNode)
2474       return false;
2475     FrameIdx = FrameNode->getIndex();
2476   } else {
2477     return false;
2478   }
2479
2480   assert(FrameIdx != NoFrameIdx);
2481   return MFI.isFixedObjectIndex(FrameIdx) &&
2482          Offset == MFI.getObjectOffset(FrameIdx) &&
2483          ArgBytes == MFI.getObjectSize(FrameIdx);
2484 }
2485
2486 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2487 /// for tail call optimization. Targets which want to do tail call
2488 /// optimization should implement this function.
     ///
     /// \param Callee            The call target node.
     /// \param CalleeCC          Calling convention of the call.
     /// \param isVarArg          Whether the call is variadic.
     /// \param isCalleeStructRet Whether the call uses struct-return semantics.
     /// \param isCallerStructRet Whether the calling function does.
     /// \param Outs              Outgoing argument descriptions.
     /// \param OutVals           Outgoing argument values, parallel to \p Outs.
     /// \param Ins               Result descriptions of the call.
     /// \param DAG               The DAG the call is being lowered in.
     /// \param isIndirect        True when the call will be made indirectly even
     ///                          though \p Callee may be a global address.
     /// \returns true only if none of the checks below rejects the call.
2489 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2490     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2491     bool isCalleeStructRet, bool isCallerStructRet,
2492     const SmallVectorImpl<ISD::OutputArg> &Outs,
2493     const SmallVectorImpl<SDValue> &OutVals,
2494     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
2495     const bool isIndirect) const {
2496   MachineFunction &MF = DAG.getMachineFunction();
2497   const Function &CallerF = MF.getFunction();
2498   CallingConv::ID CallerCC = CallerF.getCallingConv();
2499
       // Callers are expected to have ruled out subtargets without tail-call
       // support before asking.
2500   assert(Subtarget->supportsTailCall());
2501
2502   // Indirect tail calls cannot be optimized for Thumb1 if the args
2503   // to the call take up r0-r3. The reason is that there are no legal registers
2504   // left to hold the pointer to the function to be called.
2505   if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
2506       (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
2507     return false;
2508
2509   // Look for obvious safe cases to perform tail call optimization that do not
2510   // require ABI changes. This is what gcc calls sibcall.
2511
2512   // Exception-handling functions need a special set of instructions to indicate
2513   // a return to the hardware. Tail-calling another function would probably
2514   // break this.
2515   if (CallerF.hasFnAttribute("interrupt"))
2516     return false;
2517
2518   // Also avoid sibcall optimization if either caller or callee uses struct
2519   // return semantics.
2520   if (isCalleeStructRet || isCallerStructRet)
2521     return false;
2522
2523   // Externally-defined functions with weak linkage should not be
2524   // tail-called on ARM when the OS does not support dynamic
2525   // pre-emption of symbols, as the AAELF spec requires normal calls
2526   // to undefined weak functions to be replaced with a NOP or jump to the
2527   // next instruction. The behaviour of branch instructions in this
2528   // situation (as used for tail calls) is implementation-defined, so we
2529   // cannot rely on the linker replacing the tail call with a return.
2530   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2531     const GlobalValue *GV = G->getGlobal();
2532     const Triple &TT = getTargetMachine().getTargetTriple();
         // Rejected for every non-Windows OS, and on Windows as well when the
         // object format is ELF or MachO.
2533     if (GV->hasExternalWeakLinkage() &&
2534         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2535       return false;
2536   }
2537
2538   // Check that the call results are passed in the same way.
2539   LLVMContext &C = *DAG.getContext();
2540   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2541                                   CCAssignFnForReturn(CalleeCC, isVarArg),
2542                                   CCAssignFnForReturn(CallerCC, isVarArg)))
2543     return false;
2544   // The callee has to preserve all registers the caller needs to preserve.
2545   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2546   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
       // With identical calling conventions the masks are necessarily the same,
       // so the subset test is only needed when they differ.
2547   if (CalleeCC != CallerCC) {
2548     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2549     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2550       return false;
2551   }
2552
2553   // If Caller's vararg or byval argument has been split between registers and
2554   // stack, do not perform tail call, since part of the argument is in caller's
2555   // local frame.
2556   const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2557   if (AFI_Caller->getArgRegsSaveSize())
2558     return false;
2559
2560   // If the callee takes no arguments then go on to check the results of the
2561   // call.
2562   if (!Outs.empty()) {
2563     // Check if stack adjustment is needed. For now, do not do this if any
2564     // argument is passed on the stack.
2565     SmallVector<CCValAssign, 16> ArgLocs;
2566     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
2567     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
2568     if (CCInfo.getNextStackOffset()) {
2569       // Check if the arguments are already laid out in the right way as
2570       // the caller's fixed stack objects.
2571       MachineFrameInfo &MFI = MF.getFrameInfo();
2572       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2573       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
           // i indexes ArgLocs, realArgIdx indexes Outs/OutVals. The custom
           // (f64/v2f64) case below steps i over the additional ArgLocs entries
           // belonging to the same argument.
2574       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2575            i != e;
2576            ++i, ++realArgIdx) {
2577         CCValAssign &VA = ArgLocs[i];
2578         EVT RegVT = VA.getLocVT();
2579         SDValue Arg = OutVals[realArgIdx];
2580         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2581         if (VA.getLocInfo() == CCValAssign::Indirect)
2582           return false;
2583         if (VA.needsCustom()) {
2584           // f64 and vector types are split into multiple registers or
2585           // register/stack-slot combinations.  The types will not match
2586           // the registers; give up on memory f64 refs until we figure
2587           // out what to do about this.
2588           if (!VA.isRegLoc())
2589             return false;
2590           if (!ArgLocs[++i].isRegLoc())
2591             return false;
2592           if (RegVT == MVT::v2f64) {
2593             if (!ArgLocs[++i].isRegLoc())
2594               return false;
2595             if (!ArgLocs[++i].isRegLoc())
2596               return false;
2597           }
2598         } else if (!VA.isRegLoc()) {
               // A stack argument is acceptable only if it already is where the
               // callee expects it.
2599           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2600                                    MFI, MRI, TII))
2601             return false;
2602         }
2603       }
2604     }
2605
         // NOTE(review): presumably rejects arguments held in callee-saved
         // registers that would not survive the caller's epilogue - confirm in
         // parametersInCSRMatch.
2606     const MachineRegisterInfo &MRI = MF.getRegInfo();
2607     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2608       return false;
2609   }
2610
2611   return true;
2612 }
2613
2614 /// Ask CCState::CheckReturn whether \p Outs is accepted by the return-value
2615 /// assignment function selected for \p CallConv and \p isVarArg.
2616 bool ARMTargetLowering::CanLowerReturn(
2617     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2618     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2619   SmallVector<CCValAssign, 16> ReturnLocs;
2620   CCState RetState(CallConv, isVarArg, MF, ReturnLocs, Context);
2621   return RetState.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2622 }
2623
2624 /// Build the ARMISD::INTRET_FLAG return for a function carrying the
2625 /// "interrupt" attribute. The LR adjustment implied by the attribute value
2626 /// is inserted into \p RetOps as operand #1, directly after the chain.
2627 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2628                                     const SDLoc &DL, SelectionDAG &DAG) {
2629   const Function &Fn = DAG.getMachineFunction().getFunction();
2630   const StringRef Kind = Fn.getFnAttribute("interrupt").getValueAsString();
2631
2632   // See ARM ARM v7 B1.8.3. On exception entry LR holds the "preferred return
2633   // address" plus a mode-dependent offset, which the return instruction has
2634   // to subtract again when returning from PL1 without hypervisor extensions:
2635   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2636   //    SWI:     0      "subs pc, lr, #0"
2637   //    ABORT:   +4     "subs pc, lr, #4"
2638   //    UNDEF:   +4/+2  "subs pc, lr, #0"
2639   // UNDEF depends on whether the exception was taken from ARM or Thumb mode;
2640   // like GCC, we simply treat it as 0.
2641   const bool SubtractFour =
2642       Kind == "" || Kind == "IRQ" || Kind == "FIQ" || Kind == "ABORT";
2643   const bool SubtractZero = Kind == "SWI" || Kind == "UNDEF";
2644
2645   // Any other attribute value is rejected outright.
2646   if (!SubtractFour && !SubtractZero)
2647     report_fatal_error("Unsupported interrupt attribute. If present, value "
2648                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2649
2650   const int64_t LROffset = SubtractFour ? 4 : 0;
2651   SDValue OffsetOp = DAG.getConstant(LROffset, DL, MVT::i32, false);
2652   RetOps.insert(RetOps.begin() + 1, OffsetOp);
2653
2654   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2655 }
2656
2657 SDValue
2658 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2659                                bool isVarArg,
2660                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2661                                const SmallVectorImpl<SDValue> &OutVals,
2662                                const SDLoc &dl, SelectionDAG &DAG) const {
2663   // CCValAssign - represent the assignment of the return value to a location.
2664   SmallVector<CCValAssign, 16> RVLocs;
2665
2666   // CCState - Info about the registers and stack slots.
2667   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2668                  *DAG.getContext());
2669
2670   // Analyze outgoing return values.
2671   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2672
2673   SDValue Flag;
2674   SmallVector<SDValue, 4> RetOps;
2675   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2676   bool isLittleEndian = Subtarget->isLittle();
2677
2678   MachineFunction &MF = DAG.getMachineFunction();
2679   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2680   AFI->setReturnRegsCount(RVLocs.size());
2681
2682   // Copy the result values into the output registers.
2683   for (unsigned i = 0, realRVLocIdx = 0;
2684        i != RVLocs.size();
2685        ++i, ++realRVLocIdx) {
2686     CCValAssign &VA = RVLocs[i];
2687     assert(VA.isRegLoc() && "Can only return in registers!");
2688
2689     SDValue Arg = OutVals[realRVLocIdx];
2690     bool ReturnF16 = false;
2691
2692     if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
2693       // Half-precision return values can be returned like this:
2694       //
2695       // t11 f16 = fadd ...
2696       // t12: i16 = bitcast t11
2697       //   t13: i32 = zero_extend t12
2698       // t14: f32 = bitcast t13  <~~~~~~~ Arg
2699       //
2700       // to avoid code generation for bitcasts, we simply set Arg to the node
2701       // that produces the f16 value, t11 in this case.
2702       //
2703       if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
2704         SDValue ZE = Arg.getOperand(0);
2705         if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2706           SDValue BC = ZE.getOperand(0);
2707           if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2708             Arg = BC.getOperand(0);
2709             ReturnF16 = true;
2710           }
2711         }
2712       }
2713     }
2714
2715     switch (VA.getLocInfo()) {
2716     default: llvm_unreachable("Unknown loc info!");
2717     case CCValAssign::Full: break;
2718     case CCValAssign::BCvt:
2719       if (!ReturnF16)
2720         Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2721       break;
2722     }
2723
2724     if (VA.needsCustom()) {
2725       if (VA.getLocVT() == MVT::v2f64) {
2726         // Extract the first half and return it in two registers.
2727         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2728                                    DAG.getConstant(0, dl, MVT::i32));
2729         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
2730                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
2731
2732         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2733                                  HalfGPRs.getValue(isLittleEndian ? 0 : 1),
2734                                  Flag);
2735         Flag = Chain.getValue(1);
2736         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2737         VA = RVLocs[++i]; // skip ahead to next loc
2738         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2739                                  HalfGPRs.getValue(isLittleEndian ? 1 : 0),
2740                                  Flag);
2741         Flag = Chain.getValue(1);
2742         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2743         VA = RVLocs[++i]; // skip ahead to next loc
2744
2745         // Extract the 2nd half and fall through to handle it as an f64 value.
2746         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2747                           DAG.getConstant(1, dl, MVT::i32));
2748       }
2749       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
2750       // available.
2751       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2752                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
2753       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2754                                fmrrd.getValue(isLittleEndian ? 0 : 1),
2755                                Flag);
2756       Flag = Chain.getValue(1);
2757       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2758       VA = RVLocs[++i]; // skip ahead to next loc
2759       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
2760                                fmrrd.getValue(isLittleEndian ? 1 : 0),
2761                                Flag);
2762     } else
2763       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
2764
2765     // Guarantee that all emitted copies are
2766     // stuck together, avoiding something bad.
2767     Flag = Chain.getValue(1);
2768     RetOps.push_back(DAG.getRegister(VA.getLocReg(),
2769                                      ReturnF16 ? MVT::f16 : VA.getLocVT()));
2770   }
2771   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2772   const MCPhysReg *I =
2773       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2774   if (I) {
2775     for (; *I; ++I) {
2776       if (ARM::GPRRegClass.contains(*I))
2777         RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2778       else if (ARM::DPRRegClass.contains(*I))
2779         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
2780       else
2781         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2782     }
2783   }
2784
2785   // Update chain and glue.
2786   RetOps[0] = Chain;
2787   if (Flag.getNode())
2788     RetOps.push_back(Flag);
2789
2790   // CPUs which aren't M-class use a special sequence to return from
2791   // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
2792   // though we use "subs pc, lr, #N").
2793   //
2794   // M-class CPUs actually use a normal return sequence with a special
2795   // (hardware-provided) value in LR, so the normal code path works.
2796   if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
2797       !Subtarget->isMClass()) {
2798     if (Subtarget->isThumb1Only())
2799       report_fatal_error("interrupt attribute is not supported in Thumb1");
2800     return LowerInterruptReturn(RetOps, dl, DAG);
2801   }
2802
2803   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
2804 }
2805
2806 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2807   if (N->getNumValues() != 1)
2808     return false;
2809   if (!N->hasNUsesOfValue(1, 0))
2810     return false;
2811
2812   SDValue TCChain = Chain;
2813   SDNode *Copy = *N->use_begin();
2814   if (Copy->getOpcode() == ISD::CopyToReg) {
2815     // If the copy has a glue operand, we conservatively assume it isn't safe to
2816     // perform a tail call.
2817     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2818       return false;
2819     TCChain = Copy->getOperand(0);
2820   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2821     SDNode *VMov = Copy;
2822     // f64 returned in a pair of GPRs.
2823     SmallPtrSet<SDNode*, 2> Copies;
2824     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2825          UI != UE; ++UI) {
2826       if (UI->getOpcode() != ISD::CopyToReg)
2827         return false;
2828       Copies.insert(*UI);
2829     }
2830     if (Copies.size() > 2)
2831       return false;
2832
2833     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2834          UI != UE; ++UI) {
2835       SDValue UseChain = UI->getOperand(0);
2836       if (Copies.count(UseChain.getNode()))
2837         // Second CopyToReg
2838         Copy = *UI;
2839       else {
2840         // We are at the top of this chain.
2841         // If the copy has a glue operand, we conservatively assume it
2842         // isn't safe to perform a tail call.
2843         if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
2844           return false;
2845         // First CopyToReg
2846         TCChain = UseChain;
2847       }
2848     }
2849   } else if (Copy->getOpcode() == ISD::BITCAST) {
2850     // f32 returned in a single GPR.
2851     if (!Copy->hasOneUse())
2852       return false;
2853     Copy = *Copy->use_begin();
2854     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2855       return false;
2856     // If the copy has a glue operand, we conservatively assume it isn't safe to
2857     // perform a tail call.
2858     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2859       return false;
2860     TCChain = Copy->getOperand(0);
2861   } else {
2862     return false;
2863   }
2864
2865   bool HasRet = false;
2866   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2867        UI != UE; ++UI) {
2868     if (UI->getOpcode() != ARMISD::RET_FLAG &&
2869         UI->getOpcode() != ARMISD::INTRET_FLAG)
2870       return false;
2871     HasRet = true;
2872   }
2873
2874   if (!HasRet)
2875     return false;
2876
2877   Chain = TCChain;
2878   return true;
2879 }
2880
2881 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2882   if (!Subtarget->supportsTailCall())
2883     return false;
2884
2885   auto Attr =
2886       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2887   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2888     return false;
2889
2890   return true;
2891 }
2892
2893 // Trying to write a 64 bit value so need to split into two 32 bit values first,
2894 // and pass the lower and high parts through.
2895 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2896   SDLoc DL(Op);
2897   SDValue WriteValue = Op->getOperand(2);
2898
2899   // This function is only supposed to be called for i64 type argument.
2900   assert(WriteValue.getValueType() == MVT::i64
2901           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2902
2903   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2904                            DAG.getConstant(0, DL, MVT::i32));
2905   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2906                            DAG.getConstant(1, DL, MVT::i32));
2907   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2908   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2909 }
2910
2911 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2912 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2913 // one of the above mentioned nodes. It has to be wrapped because otherwise
2914 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2915 // be used to form addressing mode. These wrapped nodes will be selected
2916 // into MOVi.
2917 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
2918                                              SelectionDAG &DAG) const {
2919   EVT PtrVT = Op.getValueType();
2920   // FIXME there is no actual debug info here
2921   SDLoc dl(Op);
2922   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2923   SDValue Res;
2924
2925   // When generating execute-only code Constant Pools must be promoted to the
2926   // global data section. It's a bit ugly that we can't share them across basic
2927   // blocks, but this way we guarantee that execute-only behaves correct with
2928   // position-independent addressing modes.
2929   if (Subtarget->genExecuteOnly()) {
2930     auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2931     auto T = const_cast<Type*>(CP->getType());
2932     auto C = const_cast<Constant*>(CP->getConstVal());
2933     auto M = const_cast<Module*>(DAG.getMachineFunction().
2934                                  getFunction().getParent());
2935     auto GV = new GlobalVariable(
2936                     *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
2937                     Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
2938                     Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
2939                     Twine(AFI->createPICLabelUId())
2940                   );
2941     SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
2942                                             dl, PtrVT);
2943     return LowerGlobalAddress(GA, DAG);
2944   }
2945
2946   if (CP->isMachineConstantPoolEntry())
2947     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2948                                     CP->getAlignment());
2949   else
2950     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2951                                     CP->getAlignment());
2952   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2953 }
2954
2955 unsigned ARMTargetLowering::getJumpTableEncoding() const {
2956   return MachineJumpTableInfo::EK_Inline;
2957 }
2958
2959 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
2960                                              SelectionDAG &DAG) const {
2961   MachineFunction &MF = DAG.getMachineFunction();
2962   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2963   unsigned ARMPCLabelIndex = 0;
2964   SDLoc DL(Op);
2965   EVT PtrVT = getPointerTy(DAG.getDataLayout());
2966   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
2967   SDValue CPAddr;
2968   bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
2969   if (!IsPositionIndependent) {
2970     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
2971   } else {
2972     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
2973     ARMPCLabelIndex = AFI->createPICLabelUId();
2974     ARMConstantPoolValue *CPV =
2975       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
2976                                       ARMCP::CPBlockAddress, PCAdj);
2977     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
2978   }
2979   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
2980   SDValue Result = DAG.getLoad(
2981       PtrVT, DL, DAG.getEntryNode(), CPAddr,
2982       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2983   if (!IsPositionIndependent)
2984     return Result;
2985   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
2986   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
2987 }
2988
2989 /// Convert a TLS address reference into the correct sequence of loads
2990 /// and calls to compute the variable's address for Darwin, and return an
2991 /// SDValue containing the final node.
2992
2993 /// Darwin only has one TLS scheme which must be capable of dealing with the
2994 /// fully general situation, in the worst case. This means:
2995 ///     + "extern __thread" declaration.
2996 ///     + Defined in a possibly unknown dynamic library.
2997 ///
2998 /// The general system is that each __thread variable has a [3 x i32] descriptor
2999 /// which contains information used by the runtime to calculate the address. The
3000 /// only part of this the compiler needs to know about is the first word, which
3001 /// contains a function pointer that must be called with the address of the
3002 /// entire descriptor in "r0".
3003 ///
3004 /// Since this descriptor may be in a different unit, in general access must
3005 /// proceed along the usual ARM rules. A common sequence to produce is:
3006 ///
3007 ///     movw rT1, :lower16:_var$non_lazy_ptr
3008 ///     movt rT1, :upper16:_var$non_lazy_ptr
3009 ///     ldr r0, [rT1]
3010 ///     ldr rT2, [r0]
3011 ///     blx rT2
3012 ///     [...address now in r0...]
3013 SDValue
3014 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3015                                                SelectionDAG &DAG) const {
3016   assert(Subtarget->isTargetDarwin() &&
3017          "This function expects a Darwin target");
3018   SDLoc DL(Op);
3019
3020   // First step is to get the address of the actua global symbol. This is where
3021   // the TLS descriptor lives.
3022   SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3023
3024   // The first entry in the descriptor is a function pointer that we must call
3025   // to obtain the address of the variable.
3026   SDValue Chain = DAG.getEntryNode();
3027   SDValue FuncTLVGet = DAG.getLoad(
3028       MVT::i32, DL, Chain, DescAddr,
3029       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
3030       /* Alignment = */ 4,
3031       MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3032           MachineMemOperand::MOInvariant);
3033   Chain = FuncTLVGet.getValue(1);
3034
3035   MachineFunction &F = DAG.getMachineFunction();
3036   MachineFrameInfo &MFI = F.getFrameInfo();
3037   MFI.setAdjustsStack(true);
3038
3039   // TLS calls preserve all registers except those that absolutely must be
3040   // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3041   // silly).
3042   auto TRI =
3043       getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3044   auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3045   const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3046
3047   // Finally, we can make the call. This is just a degenerate version of a
3048   // normal AArch64 call node: r0 takes the address of the descriptor, and
3049   // returns the address of the variable in this thread.
3050   Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3051   Chain =
3052       DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3053                   Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3054                   DAG.getRegisterMask(Mask), Chain.getValue(1));
3055   return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3056 }
3057
3058 SDValue
3059 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3060                                                 SelectionDAG &DAG) const {
3061   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3062
3063   SDValue Chain = DAG.getEntryNode();
3064   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3065   SDLoc DL(Op);
3066
3067   // Load the current TEB (thread environment block)
3068   SDValue Ops[] = {Chain,
3069                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3070                    DAG.getConstant(15, DL, MVT::i32),
3071                    DAG.getConstant(0, DL, MVT::i32),
3072                    DAG.getConstant(13, DL, MVT::i32),
3073                    DAG.getConstant(0, DL, MVT::i32),
3074                    DAG.getConstant(2, DL, MVT::i32)};
3075   SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3076                                    DAG.getVTList(MVT::i32, MVT::Other), Ops);
3077
3078   SDValue TEB = CurrentTEB.getValue(0);
3079   Chain = CurrentTEB.getValue(1);
3080
3081   // Load the ThreadLocalStoragePointer from the TEB
3082   // A pointer to the TLS array is located at offset 0x2c from the TEB.
3083   SDValue TLSArray =
3084       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3085   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3086
3087   // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3088   // offset into the TLSArray.
3089
3090   // Load the TLS index from the C runtime
3091   SDValue TLSIndex =
3092       DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3093   TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3094   TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3095
3096   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3097                               DAG.getConstant(2, DL, MVT::i32));
3098   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3099                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3100                             MachinePointerInfo());
3101
3102   // Get the offset of the start of the .tls section (section base)
3103   const auto *GA = cast<GlobalAddressSDNode>(Op);
3104   auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3105   SDValue Offset = DAG.getLoad(
3106       PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3107                                     DAG.getTargetConstantPool(CPV, PtrVT, 4)),
3108       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3109
3110   return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3111 }
3112
3113 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
3114 SDValue
3115 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3116                                                  SelectionDAG &DAG) const {
3117   SDLoc dl(GA);
3118   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3119   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3120   MachineFunction &MF = DAG.getMachineFunction();
3121   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3122   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3123   ARMConstantPoolValue *CPV =
3124     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3125                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3126   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3127   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3128   Argument = DAG.getLoad(
3129       PtrVT, dl, DAG.getEntryNode(), Argument,
3130       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3131   SDValue Chain = Argument.getValue(1);
3132
3133   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3134   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3135
3136   // call __tls_get_addr.
3137   ArgListTy Args;
3138   ArgListEntry Entry;
3139   Entry.Node = Argument;
3140   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
3141   Args.push_back(Entry);
3142
3143   // FIXME: is there useful debug info available here?
3144   TargetLowering::CallLoweringInfo CLI(DAG);
3145   CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3146       CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3147       DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3148
3149   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3150   return CallResult.first;
3151 }
3152
3153 // Lower ISD::GlobalTLSAddress using the "initial exec" or
3154 // "local exec" model.
3155 SDValue
3156 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3157                                         SelectionDAG &DAG,
3158                                         TLSModel::Model model) const {
3159   const GlobalValue *GV = GA->getGlobal();
3160   SDLoc dl(GA);
3161   SDValue Offset;
3162   SDValue Chain = DAG.getEntryNode();
3163   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3164   // Get the Thread Pointer
3165   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3166
3167   if (model == TLSModel::InitialExec) {
3168     MachineFunction &MF = DAG.getMachineFunction();
3169     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3170     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3171     // Initial exec model.
3172     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3173     ARMConstantPoolValue *CPV =
3174       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3175                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3176                                       true);
3177     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3178     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3179     Offset = DAG.getLoad(
3180         PtrVT, dl, Chain, Offset,
3181         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3182     Chain = Offset.getValue(1);
3183
3184     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3185     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3186
3187     Offset = DAG.getLoad(
3188         PtrVT, dl, Chain, Offset,
3189         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3190   } else {
3191     // local exec model
3192     assert(model == TLSModel::LocalExec);
3193     ARMConstantPoolValue *CPV =
3194       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3195     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3196     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3197     Offset = DAG.getLoad(
3198         PtrVT, dl, Chain, Offset,
3199         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3200   }
3201
3202   // The address of the thread local variable is the add of the thread
3203   // pointer with the offset of the variable.
3204   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3205 }
3206
3207 SDValue
3208 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3209   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3210   if (DAG.getTarget().useEmulatedTLS())
3211     return LowerToTLSEmulatedModel(GA, DAG);
3212
3213   if (Subtarget->isTargetDarwin())
3214     return LowerGlobalTLSAddressDarwin(Op, DAG);
3215
3216   if (Subtarget->isTargetWindows())
3217     return LowerGlobalTLSAddressWindows(Op, DAG);
3218
3219   // TODO: implement the "local dynamic" model
3220   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3221   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3222
3223   switch (model) {
3224     case TLSModel::GeneralDynamic:
3225     case TLSModel::LocalDynamic:
3226       return LowerToTLSGeneralDynamicModel(GA, DAG);
3227     case TLSModel::InitialExec:
3228     case TLSModel::LocalExec:
3229       return LowerToTLSExecModels(GA, DAG, model);
3230   }
3231   llvm_unreachable("bogus TLS model");
3232 }
3233
3234 /// Return true if all users of V are within function F, looking through
3235 /// ConstantExprs.
3236 static bool allUsersAreInFunction(const Value *V, const Function *F) {
3237   SmallVector<const User*,4> Worklist;
3238   for (auto *U : V->users())
3239     Worklist.push_back(U);
3240   while (!Worklist.empty()) {
3241     auto *U = Worklist.pop_back_val();
3242     if (isa<ConstantExpr>(U)) {
3243       for (auto *UU : U->users())
3244         Worklist.push_back(UU);
3245       continue;
3246     }
3247
3248     auto *I = dyn_cast<Instruction>(U);
3249     if (!I || I->getParent()->getParent() != F)
3250       return false;
3251   }
3252   return true;
3253 }
3254
3255 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3256                                      const GlobalValue *GV, SelectionDAG &DAG,
3257                                      EVT PtrVT, const SDLoc &dl) {
3258   // If we're creating a pool entry for a constant global with unnamed address,
3259   // and the global is small enough, we can emit it inline into the constant pool
3260   // to save ourselves an indirection.
3261   //
3262   // This is a win if the constant is only used in one function (so it doesn't
3263   // need to be duplicated) or duplicating the constant wouldn't increase code
3264   // size (implying the constant is no larger than 4 bytes).
3265   const Function &F = DAG.getMachineFunction().getFunction();
3266
3267   // We rely on this decision to inline being idemopotent and unrelated to the
3268   // use-site. We know that if we inline a variable at one use site, we'll
3269   // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3270   // doesn't know about this optimization, so bail out if it's enabled else
3271   // we could decide to inline here (and thus never emit the GV) but require
3272   // the GV from fast-isel generated code.
3273   if (!EnableConstpoolPromotion ||
3274       DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3275       return SDValue();
3276
3277   auto *GVar = dyn_cast<GlobalVariable>(GV);
3278   if (!GVar || !GVar->hasInitializer() ||
3279       !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3280       !GVar->hasLocalLinkage())
3281     return SDValue();
3282
3283   // If we inline a value that contains relocations, we move the relocations
3284   // from .data to .text. This is not allowed in position-independent code.
3285   auto *Init = GVar->getInitializer();
3286   if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3287       Init->needsRelocation())
3288     return SDValue();
3289
3290   // The constant islands pass can only really deal with alignment requests
3291   // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3292   // any type wanting greater alignment requirements than 4 bytes. We also
3293   // can only promote constants that are multiples of 4 bytes in size or
3294   // are paddable to a multiple of 4. Currently we only try and pad constants
3295   // that are strings for simplicity.
3296   auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3297   unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3298   unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
3299   unsigned RequiredPadding = 4 - (Size % 4);
3300   bool PaddingPossible =
3301     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3302   if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
3303       Size == 0)
3304     return SDValue();
3305
3306   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3307   MachineFunction &MF = DAG.getMachineFunction();
3308   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3309
3310   // We can't bloat the constant pool too much, else the ConstantIslands pass
3311   // may fail to converge. If we haven't promoted this global yet (it may have
3312   // multiple uses), and promoting it would increase the constant pool size (Sz
3313   // > 4), ensure we have space to do so up to MaxTotal.
3314   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3315     if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3316         ConstpoolPromotionMaxTotal)
3317       return SDValue();
3318
3319   // This is only valid if all users are in a single function; we can't clone
3320   // the constant in general. The LLVM IR unnamed_addr allows merging
3321   // constants, but not cloning them.
3322   //
3323   // We could potentially allow cloning if we could prove all uses of the
3324   // constant in the current function don't care about the address, like
3325   // printf format strings. But that isn't implemented for now.
3326   if (!allUsersAreInFunction(GVar, &F))
3327     return SDValue();
3328
3329   // We're going to inline this global. Pad it out if needed.
3330   if (RequiredPadding != 4) {
3331     StringRef S = CDAInit->getAsString();
3332
3333     SmallVector<uint8_t,16> V(S.size());
3334     std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3335     while (RequiredPadding--)
3336       V.push_back(0);
3337     Init = ConstantDataArray::get(*DAG.getContext(), V);
3338   }
3339
3340   auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3341   SDValue CPAddr =
3342     DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
3343   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3344     AFI->markGlobalAsPromotedToConstantPool(GVar);
3345     AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3346                                       PaddedSize - 4);
3347   }
3348   ++NumConstpoolPromoted;
3349   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3350 }
3351
3352 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
3353   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3354     if (!(GV = GA->getBaseObject()))
3355       return false;
3356   if (const auto *V = dyn_cast<GlobalVariable>(GV))
3357     return V->isConstant();
3358   return isa<Function>(GV);
3359 }
3360
3361 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3362                                               SelectionDAG &DAG) const {
3363   switch (Subtarget->getTargetTriple().getObjectFormat()) {
3364   default: llvm_unreachable("unknown object format");
3365   case Triple::COFF:
3366     return LowerGlobalAddressWindows(Op, DAG);
3367   case Triple::ELF:
3368     return LowerGlobalAddressELF(Op, DAG);
3369   case Triple::MachO:
3370     return LowerGlobalAddressDarwin(Op, DAG);
3371   }
3372 }
3373
3374 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3375                                                  SelectionDAG &DAG) const {
3376   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3377   SDLoc dl(Op);
3378   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3379   const TargetMachine &TM = getTargetMachine();
3380   bool IsRO = isReadOnly(GV);
3381
3382   // promoteToConstantPool only if not generating XO text section
3383   if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
3384     if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3385       return V;
3386
3387   if (isPositionIndependent()) {
3388     bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3389     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
3390                                            UseGOT_PREL ? ARMII::MO_GOT : 0);
3391     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3392     if (UseGOT_PREL)
3393       Result =
3394           DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3395                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3396     return Result;
3397   } else if (Subtarget->isROPI() && IsRO) {
3398     // PC-relative.
3399     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3400     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3401     return Result;
3402   } else if (Subtarget->isRWPI() && !IsRO) {
3403     // SB-relative.
3404     SDValue RelAddr;
3405     if (Subtarget->useMovt()) {
3406       ++NumMovwMovt;
3407       SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3408       RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3409     } else { // use literal pool for address constant
3410       ARMConstantPoolValue *CPV =
3411         ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3412       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3413       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3414       RelAddr = DAG.getLoad(
3415           PtrVT, dl, DAG.getEntryNode(), CPAddr,
3416           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3417     }
3418     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3419     SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3420     return Result;
3421   }
3422
3423   // If we have T2 ops, we can materialize the address directly via movt/movw
3424   // pair. This is always cheaper.
3425   if (Subtarget->useMovt()) {
3426     ++NumMovwMovt;
3427     // FIXME: Once remat is capable of dealing with instructions with register
3428     // operands, expand this into two nodes.
3429     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3430                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3431   } else {
3432     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
3433     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3434     return DAG.getLoad(
3435         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3436         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3437   }
3438 }
3439
3440 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3441                                                     SelectionDAG &DAG) const {
3442   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3443          "ROPI/RWPI not currently supported for Darwin");
3444   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3445   SDLoc dl(Op);
3446   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3447
3448   if (Subtarget->useMovt())
3449     ++NumMovwMovt;
3450
3451   // FIXME: Once remat is capable of dealing with instructions with register
3452   // operands, expand this into multiple nodes
3453   unsigned Wrapper =
3454       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3455
3456   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3457   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3458
3459   if (Subtarget->isGVIndirectSymbol(GV))
3460     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3461                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3462   return Result;
3463 }
3464
3465 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3466                                                      SelectionDAG &DAG) const {
3467   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3468   assert(Subtarget->useMovt() &&
3469          "Windows on ARM expects to use movw/movt");
3470   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3471          "ROPI/RWPI not currently supported for Windows");
3472
3473   const TargetMachine &TM = getTargetMachine();
3474   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3475   ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3476   if (GV->hasDLLImportStorageClass())
3477     TargetFlags = ARMII::MO_DLLIMPORT;
3478   else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
3479     TargetFlags = ARMII::MO_COFFSTUB;
3480   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3481   SDValue Result;
3482   SDLoc DL(Op);
3483
3484   ++NumMovwMovt;
3485
3486   // FIXME: Once remat is capable of dealing with instructions with register
3487   // operands, expand this into two nodes.
3488   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3489                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3490                                                   TargetFlags));
3491   if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3492     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3493                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3494   return Result;
3495 }
3496
3497 SDValue
3498 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3499   SDLoc dl(Op);
3500   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3501   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3502                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3503                      Op.getOperand(1), Val);
3504 }
3505
3506 SDValue
3507 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3508   SDLoc dl(Op);
3509   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3510                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3511 }
3512
3513 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3514                                                       SelectionDAG &DAG) const {
3515   SDLoc dl(Op);
3516   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3517                      Op.getOperand(0));
3518 }
3519
3520 SDValue
3521 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3522                                           const ARMSubtarget *Subtarget) const {
3523   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3524   SDLoc dl(Op);
3525   switch (IntNo) {
3526   default: return SDValue();    // Don't custom lower most intrinsics.
3527   case Intrinsic::thread_pointer: {
3528     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3529     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3530   }
3531   case Intrinsic::eh_sjlj_lsda: {
3532     MachineFunction &MF = DAG.getMachineFunction();
3533     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3534     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3535     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3536     SDValue CPAddr;
3537     bool IsPositionIndependent = isPositionIndependent();
3538     unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3539     ARMConstantPoolValue *CPV =
3540       ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3541                                       ARMCP::CPLSDA, PCAdj);
3542     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3543     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3544     SDValue Result = DAG.getLoad(
3545         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3546         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3547
3548     if (IsPositionIndependent) {
3549       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3550       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3551     }
3552     return Result;
3553   }
3554   case Intrinsic::arm_neon_vabs:
3555     return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3556                         Op.getOperand(1));
3557   case Intrinsic::arm_neon_vmulls:
3558   case Intrinsic::arm_neon_vmullu: {
3559     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3560       ? ARMISD::VMULLs : ARMISD::VMULLu;
3561     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3562                        Op.getOperand(1), Op.getOperand(2));
3563   }
3564   case Intrinsic::arm_neon_vminnm:
3565   case Intrinsic::arm_neon_vmaxnm: {
3566     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3567       ? ISD::FMINNUM : ISD::FMAXNUM;
3568     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3569                        Op.getOperand(1), Op.getOperand(2));
3570   }
3571   case Intrinsic::arm_neon_vminu:
3572   case Intrinsic::arm_neon_vmaxu: {
3573     if (Op.getValueType().isFloatingPoint())
3574       return SDValue();
3575     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3576       ? ISD::UMIN : ISD::UMAX;
3577     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3578                          Op.getOperand(1), Op.getOperand(2));
3579   }
3580   case Intrinsic::arm_neon_vmins:
3581   case Intrinsic::arm_neon_vmaxs: {
3582     // v{min,max}s is overloaded between signed integers and floats.
3583     if (!Op.getValueType().isFloatingPoint()) {
3584       unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3585         ? ISD::SMIN : ISD::SMAX;
3586       return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3587                          Op.getOperand(1), Op.getOperand(2));
3588     }
3589     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3590       ? ISD::FMINIMUM : ISD::FMAXIMUM;
3591     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3592                        Op.getOperand(1), Op.getOperand(2));
3593   }
3594   case Intrinsic::arm_neon_vtbl1:
3595     return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3596                        Op.getOperand(1), Op.getOperand(2));
3597   case Intrinsic::arm_neon_vtbl2:
3598     return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3599                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3600   }
3601 }
3602
3603 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
3604                                  const ARMSubtarget *Subtarget) {
3605   SDLoc dl(Op);
3606   ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
3607   auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
3608   if (SSID == SyncScope::SingleThread)
3609     return Op;
3610
3611   if (!Subtarget->hasDataBarrier()) {
3612     // Some ARMv6 cpus can support data barriers with an mcr instruction.
3613     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3614     // here.
3615     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3616            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3617     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3618                        DAG.getConstant(0, dl, MVT::i32));
3619   }
3620
3621   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
3622   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
3623   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
3624   if (Subtarget->isMClass()) {
3625     // Only a full system barrier exists in the M-class architectures.
3626     Domain = ARM_MB::SY;
3627   } else if (Subtarget->preferISHSTBarriers() &&
3628              Ord == AtomicOrdering::Release) {
3629     // Swift happens to implement ISHST barriers in a way that's compatible with
3630     // Release semantics but weaker than ISH so we'd be fools not to use
3631     // it. Beware: other processors probably don't!
3632     Domain = ARM_MB::ISHST;
3633   }
3634
3635   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
3636                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
3637                      DAG.getConstant(Domain, dl, MVT::i32));
3638 }
3639
3640 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
3641                              const ARMSubtarget *Subtarget) {
3642   // ARM pre v5TE and Thumb1 does not have preload instructions.
3643   if (!(Subtarget->isThumb2() ||
3644         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
3645     // Just preserve the chain.
3646     return Op.getOperand(0);
3647
3648   SDLoc dl(Op);
3649   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
3650   if (!isRead &&
3651       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
3652     // ARMv7 with MP extension has PLDW.
3653     return Op.getOperand(0);
3654
3655   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3656   if (Subtarget->isThumb()) {
3657     // Invert the bits.
3658     isRead = ~isRead & 1;
3659     isData = ~isData & 1;
3660   }
3661
3662   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
3663                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
3664                      DAG.getConstant(isData, dl, MVT::i32));
3665 }
3666
3667 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
3668   MachineFunction &MF = DAG.getMachineFunction();
3669   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
3670
3671   // vastart just stores the address of the VarArgsFrameIndex slot into the
3672   // memory location argument.
3673   SDLoc dl(Op);
3674   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
3675   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3676   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3677   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
3678                       MachinePointerInfo(SV));
3679 }
3680
3681 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
3682                                                 CCValAssign &NextVA,
3683                                                 SDValue &Root,
3684                                                 SelectionDAG &DAG,
3685                                                 const SDLoc &dl) const {
3686   MachineFunction &MF = DAG.getMachineFunction();
3687   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3688
3689   const TargetRegisterClass *RC;
3690   if (AFI->isThumb1OnlyFunction())
3691     RC = &ARM::tGPRRegClass;
3692   else
3693     RC = &ARM::GPRRegClass;
3694
3695   // Transform the arguments stored in physical registers into virtual ones.
3696   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3697   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3698
3699   SDValue ArgValue2;
3700   if (NextVA.isMemLoc()) {
3701     MachineFrameInfo &MFI = MF.getFrameInfo();
3702     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
3703
3704     // Create load node to retrieve arguments from the stack.
3705     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3706     ArgValue2 = DAG.getLoad(
3707         MVT::i32, dl, Root, FIN,
3708         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3709   } else {
3710     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3711     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3712   }
3713   if (!Subtarget->isLittle())
3714     std::swap (ArgValue, ArgValue2);
3715   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
3716 }
3717
3718 // The remaining GPRs hold either the beginning of variable-argument
3719 // data, or the beginning of an aggregate passed by value (usually
3720 // byval).  Either way, we allocate stack slots adjacent to the data
3721 // provided by our caller, and store the unallocated registers there.
3722 // If this is a variadic function, the va_list pointer will begin with
3723 // these values; otherwise, this reassembles a (byval) structure that
3724 // was split between registers and memory.
3725 // Return: The frame index registers were stored into.
3726 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
3727                                       const SDLoc &dl, SDValue &Chain,
3728                                       const Value *OrigArg,
3729                                       unsigned InRegsParamRecordIdx,
3730                                       int ArgOffset, unsigned ArgSize) const {
3731   // Currently, two use-cases possible:
3732   // Case #1. Non-var-args function, and we meet first byval parameter.
3733   //          Setup first unallocated register as first byval register;
3734   //          eat all remained registers
3735   //          (these two actions are performed by HandleByVal method).
3736   //          Then, here, we initialize stack frame with
3737   //          "store-reg" instructions.
3738   // Case #2. Var-args function, that doesn't contain byval parameters.
3739   //          The same: eat all remained unallocated registers,
3740   //          initialize stack frame.
3741
3742   MachineFunction &MF = DAG.getMachineFunction();
3743   MachineFrameInfo &MFI = MF.getFrameInfo();
3744   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3745   unsigned RBegin, REnd;
3746   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
3747     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
3748   } else {
3749     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3750     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
3751     REnd = ARM::R4;
3752   }
3753
3754   if (REnd != RBegin)
3755     ArgOffset = -4 * (ARM::R4 - RBegin);
3756
3757   auto PtrVT = getPointerTy(DAG.getDataLayout());
3758   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
3759   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
3760
3761   SmallVector<SDValue, 4> MemOps;
3762   const TargetRegisterClass *RC =
3763       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
3764
3765   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
3766     unsigned VReg = MF.addLiveIn(Reg, RC);
3767     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
3768     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
3769                                  MachinePointerInfo(OrigArg, 4 * i));
3770     MemOps.push_back(Store);
3771     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
3772   }
3773
3774   if (!MemOps.empty())
3775     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3776   return FrameIndex;
3777 }
3778
3779 // Setup stack frame, the va_list pointer will start from.
3780 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3781                                              const SDLoc &dl, SDValue &Chain,
3782                                              unsigned ArgOffset,
3783                                              unsigned TotalArgRegsSaveSize,
3784                                              bool ForceMutable) const {
3785   MachineFunction &MF = DAG.getMachineFunction();
3786   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3787
3788   // Try to store any remaining integer argument regs
3789   // to their spots on the stack so that they may be loaded by dereferencing
3790   // the result of va_next.
3791   // If there is no regs to be stored, just point address after last
3792   // argument passed via stack.
3793   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3794                                   CCInfo.getInRegsParamsCount(),
3795                                   CCInfo.getNextStackOffset(),
3796                                   std::max(4U, TotalArgRegsSaveSize));
3797   AFI->setVarArgsFrameIndex(FrameIndex);
3798 }
3799
3800 SDValue ARMTargetLowering::LowerFormalArguments(
3801     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3802     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3803     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3804   MachineFunction &MF = DAG.getMachineFunction();
3805   MachineFrameInfo &MFI = MF.getFrameInfo();
3806
3807   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3808
3809   // Assign locations to all of the incoming arguments.
3810   SmallVector<CCValAssign, 16> ArgLocs;
3811   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3812                  *DAG.getContext());
3813   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
3814
3815   SmallVector<SDValue, 16> ArgValues;
3816   SDValue ArgValue;
3817   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3818   unsigned CurArgIdx = 0;
3819
3820   // Initially ArgRegsSaveSize is zero.
3821   // Then we increase this value each time we meet byval parameter.
3822   // We also increase this value in case of varargs function.
3823   AFI->setArgRegsSaveSize(0);
3824
3825   // Calculate the amount of stack space that we need to allocate to store
3826   // byval and variadic arguments that are passed in registers.
3827   // We need to know this before we allocate the first byval or variadic
3828   // argument, as they will be allocated a stack slot below the CFA (Canonical
3829   // Frame Address, the stack pointer at entry to the function).
3830   unsigned ArgRegBegin = ARM::R4;
3831   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3832     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
3833       break;
3834
3835     CCValAssign &VA = ArgLocs[i];
3836     unsigned Index = VA.getValNo();
3837     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
3838     if (!Flags.isByVal())
3839       continue;
3840
3841     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
3842     unsigned RBegin, REnd;
3843     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
3844     ArgRegBegin = std::min(ArgRegBegin, RBegin);
3845
3846     CCInfo.nextInRegsParam();
3847   }
3848   CCInfo.rewindByValRegsInfo();
3849
3850   int lastInsIndex = -1;
3851   if (isVarArg && MFI.hasVAStart()) {
3852     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3853     if (RegIdx != array_lengthof(GPRArgRegs))
3854       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
3855   }
3856
3857   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
3858   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
3859   auto PtrVT = getPointerTy(DAG.getDataLayout());
3860
3861   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3862     CCValAssign &VA = ArgLocs[i];
3863     if (Ins[VA.getValNo()].isOrigArg()) {
3864       std::advance(CurOrigArg,
3865                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
3866       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
3867     }
3868     // Arguments stored in registers.
3869     if (VA.isRegLoc()) {
3870       EVT RegVT = VA.getLocVT();
3871
3872       if (VA.needsCustom()) {
3873         // f64 and vector types are split up into multiple registers or
3874         // combinations of registers and stack slots.
3875         if (VA.getLocVT() == MVT::v2f64) {
3876           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
3877                                                    Chain, DAG, dl);
3878           VA = ArgLocs[++i]; // skip ahead to next loc
3879           SDValue ArgValue2;
3880           if (VA.isMemLoc()) {
3881             int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
3882             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3883             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
3884                                     MachinePointerInfo::getFixedStack(
3885                                         DAG.getMachineFunction(), FI));
3886           } else {
3887             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
3888                                              Chain, DAG, dl);
3889           }
3890           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
3891           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3892                                  ArgValue, ArgValue1,
3893                                  DAG.getIntPtrConstant(0, dl));
3894           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3895                                  ArgValue, ArgValue2,
3896                                  DAG.getIntPtrConstant(1, dl));
3897         } else
3898           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
3899       } else {
3900         const TargetRegisterClass *RC;
3901
3902
3903         if (RegVT == MVT::f16)
3904           RC = &ARM::HPRRegClass;
3905         else if (RegVT == MVT::f32)
3906           RC = &ARM::SPRRegClass;
3907         else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
3908           RC = &ARM::DPRRegClass;
3909         else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
3910           RC = &ARM::QPRRegClass;
3911         else if (RegVT == MVT::i32)
3912           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
3913                                            : &ARM::GPRRegClass;
3914         else
3915           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
3916
3917         // Transform the arguments in physical registers into virtual ones.
3918         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3919         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3920
3921         // If this value is passed in r0 and has the returned attribute (e.g.
3922         // C++ 'structors), record this fact for later use.
3923         if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
3924           AFI->setPreservesR0();
3925         }
3926       }
3927
3928       // If this is an 8 or 16-bit value, it is really passed promoted
3929       // to 32 bits.  Insert an assert[sz]ext to capture this, then
3930       // truncate to the right size.
3931       switch (VA.getLocInfo()) {
3932       default: llvm_unreachable("Unknown loc info!");
3933       case CCValAssign::Full: break;
3934       case CCValAssign::BCvt:
3935         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
3936         break;
3937       case CCValAssign::SExt:
3938         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3939                                DAG.getValueType(VA.getValVT()));
3940         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3941         break;
3942       case CCValAssign::ZExt:
3943         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3944                                DAG.getValueType(VA.getValVT()));
3945         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3946         break;
3947       }
3948
3949       InVals.push_back(ArgValue);
3950     } else { // VA.isRegLoc()
3951       // sanity check
3952       assert(VA.isMemLoc());
3953       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
3954
3955       int index = VA.getValNo();
3956
3957       // Some Ins[] entries become multiple ArgLoc[] entries.
3958       // Process them only once.
3959       if (index != lastInsIndex)
3960         {
3961           ISD::ArgFlagsTy Flags = Ins[index].Flags;
3962           // FIXME: For now, all byval parameter objects are marked mutable.
3963           // This can be changed with more analysis.
3964           // In case of tail call optimization mark all arguments mutable.
3965           // Since they could be overwritten by lowering of arguments in case of
3966           // a tail call.
3967           if (Flags.isByVal()) {
3968             assert(Ins[index].isOrigArg() &&
3969                    "Byval arguments cannot be implicit");
3970             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
3971
3972             int FrameIndex = StoreByValRegs(
3973                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
3974                 VA.getLocMemOffset(), Flags.getByValSize());
3975             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
3976             CCInfo.nextInRegsParam();
3977           } else {
3978             unsigned FIOffset = VA.getLocMemOffset();
3979             int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
3980                                            FIOffset, true);
3981
3982             // Create load nodes to retrieve arguments from the stack.
3983             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3984             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
3985                                          MachinePointerInfo::getFixedStack(
3986                                              DAG.getMachineFunction(), FI)));
3987           }
3988           lastInsIndex = index;
3989         }
3990     }
3991   }
3992
3993   // varargs
3994   if (isVarArg && MFI.hasVAStart())
3995     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
3996                          CCInfo.getNextStackOffset(),
3997                          TotalArgRegsSaveSize);
3998
3999   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
4000
4001   return Chain;
4002 }
4003
4004 /// isFloatingPointZero - Return true if this is +0.0.
4005 static bool isFloatingPointZero(SDValue Op) {
4006   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4007     return CFP->getValueAPF().isPosZero();
4008   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4009     // Maybe this has already been legalized into the constant pool?
4010     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4011       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4012       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4013         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4014           return CFP->getValueAPF().isPosZero();
4015     }
4016   } else if (Op->getOpcode() == ISD::BITCAST &&
4017              Op->getValueType(0) == MVT::f64) {
4018     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4019     // created by LowerConstantFP().
4020     SDValue BitcastOp = Op->getOperand(0);
4021     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4022         isNullConstant(BitcastOp->getOperand(0)))
4023       return true;
4024   }
4025   return false;
4026 }
4027
4028 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4029 /// the given operands.
4030 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4031                                      SDValue &ARMcc, SelectionDAG &DAG,
4032                                      const SDLoc &dl) const {
4033   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4034     unsigned C = RHSC->getZExtValue();
4035     if (!isLegalICmpImmediate((int32_t)C)) {
4036       // Constant does not fit, try adjusting it by one.
4037       switch (CC) {
4038       default: break;
4039       case ISD::SETLT:
4040       case ISD::SETGE:
4041         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4042           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4043           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4044         }
4045         break;
4046       case ISD::SETULT:
4047       case ISD::SETUGE:
4048         if (C != 0 && isLegalICmpImmediate(C-1)) {
4049           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4050           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4051         }
4052         break;
4053       case ISD::SETLE:
4054       case ISD::SETGT:
4055         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4056           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4057           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4058         }
4059         break;
4060       case ISD::SETULE:
4061       case ISD::SETUGT:
4062         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4063           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4064           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4065         }
4066         break;
4067       }
4068     }
4069   } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4070              (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4071     // In ARM and Thumb-2, the compare instructions can shift their second
4072     // operand.
4073     CC = ISD::getSetCCSwappedOperands(CC);
4074     std::swap(LHS, RHS);
4075   }
4076
4077   // Thumb1 has very limited immediate modes, so turning an "and" into a
4078   // shift can save multiple instructions.
4079   //
4080   // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4081   // into "((x << n) >> n)".  But that isn't necessarily profitable on its
4082   // own. If it's the operand to an unsigned comparison with an immediate,
4083   // we can eliminate one of the shifts: we transform
4084   // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4085   //
4086   // We avoid transforming cases which aren't profitable due to encoding
4087   // details:
4088   //
4089   // 1. C2 fits into the immediate field of a cmp, and the transformed version
4090   // would not; in that case, we're essentially trading one immediate load for
4091   // another.
4092   // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4093   // 3. C2 is zero; we have other code for this special case.
4094   //
4095   // FIXME: Figure out profitability for Thumb2; we usually can't save an
4096   // instruction, since the AND is always one instruction anyway, but we could
4097   // use narrow instructions in some cases.
4098   if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4099       LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4100       LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4101       !isSignedIntSetCC(CC)) {
4102     unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
4103     auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4104     uint64_t RHSV = RHSC->getZExtValue();
4105     if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4106       unsigned ShiftBits = countLeadingZeros(Mask);
4107       if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4108         SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4109         LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4110         RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4111       }
4112     }
4113   }
4114
4115   // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4116   // single "lsls x, c+1".  The shift sets the "C" and "Z" flags the same
4117   // way a cmp would.
4118   // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4119   // some tweaks to the heuristics for the previous and->shift transform.
4120   // FIXME: Optimize cases where the LHS isn't a shift.
4121   if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4122       isa<ConstantSDNode>(RHS) &&
4123       cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
4124       CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4125       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
4126     unsigned ShiftAmt =
4127       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
4128     SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4129                                 DAG.getVTList(MVT::i32, MVT::i32),
4130                                 LHS.getOperand(0),
4131                                 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4132     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4133                                      Shift.getValue(1), SDValue());
4134     ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4135     return Chain.getValue(1);
4136   }
4137
4138   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4139
4140   // If the RHS is a constant zero then the V (overflow) flag will never be
4141   // set. This can allow us to simplify GE to PL or LT to MI, which can be
4142   // simpler for other passes (like the peephole optimiser) to deal with.
4143   if (isNullConstant(RHS)) {
4144     switch (CondCode) {
4145       default: break;
4146       case ARMCC::GE:
4147         CondCode = ARMCC::PL;
4148         break;
4149       case ARMCC::LT:
4150         CondCode = ARMCC::MI;
4151         break;
4152     }
4153   }
4154
4155   ARMISD::NodeType CompareType;
4156   switch (CondCode) {
4157   default:
4158     CompareType = ARMISD::CMP;
4159     break;
4160   case ARMCC::EQ:
4161   case ARMCC::NE:
4162     // Uses only Z Flag
4163     CompareType = ARMISD::CMPZ;
4164     break;
4165   }
4166   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4167   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4168 }
4169
4170 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4171 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4172                                      SelectionDAG &DAG, const SDLoc &dl,
4173                                      bool InvalidOnQNaN) const {
4174   assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4175   SDValue Cmp;
4176   SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
4177   if (!isFloatingPointZero(RHS))
4178     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
4179   else
4180     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
4181   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4182 }
4183
4184 /// duplicateCmp - Glue values can have only one use, so this function
4185 /// duplicates a comparison node.
4186 SDValue
4187 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4188   unsigned Opc = Cmp.getOpcode();
4189   SDLoc DL(Cmp);
4190   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4191     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4192
4193   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4194   Cmp = Cmp.getOperand(0);
4195   Opc = Cmp.getOpcode();
4196   if (Opc == ARMISD::CMPFP)
4197     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
4198                       Cmp.getOperand(1), Cmp.getOperand(2));
4199   else {
4200     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4201     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
4202                       Cmp.getOperand(1));
4203   }
4204   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4205 }
4206
4207 // This function returns three things: the arithmetic computation itself
4208 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc).  The
4209 // comparison and the condition code define the case in which the arithmetic
4210 // computation *does not* overflow.
4211 std::pair<SDValue, SDValue>
4212 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4213                                  SDValue &ARMcc) const {
4214   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
4215
4216   SDValue Value, OverflowCmp;
4217   SDValue LHS = Op.getOperand(0);
4218   SDValue RHS = Op.getOperand(1);
4219   SDLoc dl(Op);
4220
4221   // FIXME: We are currently always generating CMPs because we don't support
4222   // generating CMN through the backend. This is not as good as the natural
4223   // CMP case because it causes a register dependency and cannot be folded
4224   // later.
4225
4226   switch (Op.getOpcode()) {
4227   default:
4228     llvm_unreachable("Unknown overflow instruction!");
4229   case ISD::SADDO:
4230     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4231     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4232     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4233     break;
4234   case ISD::UADDO:
4235     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4236     // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4237     // We do not use it in the USUBO case as Value may not be used.
4238     Value = DAG.getNode(ARMISD::ADDC, dl,
4239                         DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4240                 .getValue(0);
4241     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4242     break;
4243   case ISD::SSUBO:
4244     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4245     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4246     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4247     break;
4248   case ISD::USUBO:
4249     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4250     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4251     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4252     break;
4253   case ISD::UMULO:
4254     // We generate a UMUL_LOHI and then check if the high word is 0.
4255     ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4256     Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4257                         DAG.getVTList(Op.getValueType(), Op.getValueType()),
4258                         LHS, RHS);
4259     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4260                               DAG.getConstant(0, dl, MVT::i32));
4261     Value = Value.getValue(0); // We only want the low 32 bits for the result.
4262     break;
4263   case ISD::SMULO:
4264     // We generate a SMUL_LOHI and then check if all the bits of the high word
4265     // are the same as the sign bit of the low word.
4266     ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4267     Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4268                         DAG.getVTList(Op.getValueType(), Op.getValueType()),
4269                         LHS, RHS);
4270     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4271                               DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4272                                           Value.getValue(0),
4273                                           DAG.getConstant(31, dl, MVT::i32)));
4274     Value = Value.getValue(0); // We only want the low 32 bits for the result.
4275     break;
4276   } // switch (...)
4277
4278   return std::make_pair(Value, OverflowCmp);
4279 }
4280
4281 SDValue
4282 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4283   // Let legalize expand this if it isn't a legal type yet.
4284   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4285     return SDValue();
4286
4287   SDValue Value, OverflowCmp;
4288   SDValue ARMcc;
4289   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4290   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4291   SDLoc dl(Op);
4292   // We use 0 and 1 as false and true values.
4293   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4294   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4295   EVT VT = Op.getValueType();
4296
4297   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
4298                                  ARMcc, CCR, OverflowCmp);
4299
4300   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4301   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4302 }
4303
4304 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4305                                               SelectionDAG &DAG) {
4306   SDLoc DL(BoolCarry);
4307   EVT CarryVT = BoolCarry.getValueType();
4308
4309   // This converts the boolean value carry into the carry flag by doing
4310   // ARMISD::SUBC Carry, 1
4311   SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4312                               DAG.getVTList(CarryVT, MVT::i32),
4313                               BoolCarry, DAG.getConstant(1, DL, CarryVT));
4314   return Carry.getValue(1);
4315 }
4316
4317 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4318                                               SelectionDAG &DAG) {
4319   SDLoc DL(Flags);
4320
4321   // Now convert the carry flag into a boolean carry. We do this
4322   // using ARMISD:ADDE 0, 0, Carry
4323   return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4324                      DAG.getConstant(0, DL, MVT::i32),
4325                      DAG.getConstant(0, DL, MVT::i32), Flags);
4326 }
4327
4328 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4329                                              SelectionDAG &DAG) const {
4330   // Let legalize expand this if it isn't a legal type yet.
4331   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4332     return SDValue();
4333
4334   SDValue LHS = Op.getOperand(0);
4335   SDValue RHS = Op.getOperand(1);
4336   SDLoc dl(Op);
4337
4338   EVT VT = Op.getValueType();
4339   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4340   SDValue Value;
4341   SDValue Overflow;
4342   switch (Op.getOpcode()) {
4343   default:
4344     llvm_unreachable("Unknown overflow instruction!");
4345   case ISD::UADDO:
4346     Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4347     // Convert the carry flag into a boolean value.
4348     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4349     break;
4350   case ISD::USUBO: {
4351     Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4352     // Convert the carry flag into a boolean value.
4353     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4354     // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4355     // value. So compute 1 - C.
4356     Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4357                            DAG.getConstant(1, dl, MVT::i32), Overflow);
4358     break;
4359   }
4360   }
4361
4362   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4363 }
4364
4365 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4366   SDValue Cond = Op.getOperand(0);
4367   SDValue SelectTrue = Op.getOperand(1);
4368   SDValue SelectFalse = Op.getOperand(2);
4369   SDLoc dl(Op);
4370   unsigned Opc = Cond.getOpcode();
4371
4372   if (Cond.getResNo() == 1 &&
4373       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4374        Opc == ISD::USUBO)) {
4375     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
4376       return SDValue();
4377
4378     SDValue Value, OverflowCmp;
4379     SDValue ARMcc;
4380     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4381     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4382     EVT VT = Op.getValueType();
4383
4384     return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
4385                    OverflowCmp, DAG);
4386   }
4387
4388   // Convert:
4389   //
4390   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4391   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4392   //
4393   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4394     const ConstantSDNode *CMOVTrue =
4395       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4396     const ConstantSDNode *CMOVFalse =
4397       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4398
4399     if (CMOVTrue && CMOVFalse) {
4400       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4401       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4402
4403       SDValue True;
4404       SDValue False;
4405       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4406         True = SelectTrue;
4407         False = SelectFalse;
4408       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4409         True = SelectFalse;
4410         False = SelectTrue;
4411       }
4412
4413       if (True.getNode() && False.getNode()) {
4414         EVT VT = Op.getValueType();
4415         SDValue ARMcc = Cond.getOperand(2);
4416         SDValue CCR = Cond.getOperand(3);
4417         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
4418         assert(True.getValueType() == VT);
4419         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
4420       }
4421     }
4422   }
4423
4424   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4425   // undefined bits before doing a full-word comparison with zero.
4426   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4427                      DAG.getConstant(1, dl, Cond.getValueType()));
4428
4429   return DAG.getSelectCC(dl, Cond,
4430                          DAG.getConstant(0, dl, Cond.getValueType()),
4431                          SelectTrue, SelectFalse, ISD::SETNE);
4432 }
4433
4434 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4435                                  bool &swpCmpOps, bool &swpVselOps) {
4436   // Start by selecting the GE condition code for opcodes that return true for
4437   // 'equality'
4438   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4439       CC == ISD::SETULE || CC == ISD::SETGE  || CC == ISD::SETLE)
4440     CondCode = ARMCC::GE;
4441
4442   // and GT for opcodes that return false for 'equality'.
4443   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4444            CC == ISD::SETULT || CC == ISD::SETGT  || CC == ISD::SETLT)
4445     CondCode = ARMCC::GT;
4446
4447   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4448   // to swap the compare operands.
4449   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4450       CC == ISD::SETULT || CC == ISD::SETLE  || CC == ISD::SETLT)
4451     swpCmpOps = true;
4452
4453   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4454   // If we have an unordered opcode, we need to swap the operands to the VSEL
4455   // instruction (effectively negating the condition).
4456   //
4457   // This also has the effect of swapping which one of 'less' or 'greater'
4458   // returns true, so we also swap the compare operands. It also switches
4459   // whether we return true for 'equality', so we compensate by picking the
4460   // opposite condition code to our original choice.
4461   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4462       CC == ISD::SETUGT) {
4463     swpCmpOps = !swpCmpOps;
4464     swpVselOps = !swpVselOps;
4465     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4466   }
4467
4468   // 'ordered' is 'anything but unordered', so use the VS condition code and
4469   // swap the VSEL operands.
4470   if (CC == ISD::SETO) {
4471     CondCode = ARMCC::VS;
4472     swpVselOps = true;
4473   }
4474
4475   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4476   // code and swap the VSEL operands. Also do this if we don't care about the
4477   // unordered case.
4478   if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4479     CondCode = ARMCC::EQ;
4480     swpVselOps = true;
4481   }
4482 }
4483
4484 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4485                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
4486                                    SDValue Cmp, SelectionDAG &DAG) const {
4487   if (!Subtarget->hasFP64() && VT == MVT::f64) {
4488     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4489                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4490     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4491                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4492
4493     SDValue TrueLow = TrueVal.getValue(0);
4494     SDValue TrueHigh = TrueVal.getValue(1);
4495     SDValue FalseLow = FalseVal.getValue(0);
4496     SDValue FalseHigh = FalseVal.getValue(1);
4497
4498     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4499                               ARMcc, CCR, Cmp);
4500     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4501                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
4502
4503     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4504   } else {
4505     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
4506                        Cmp);
4507   }
4508 }
4509
4510 static bool isGTorGE(ISD::CondCode CC) {
4511   return CC == ISD::SETGT || CC == ISD::SETGE;
4512 }
4513
4514 static bool isLTorLE(ISD::CondCode CC) {
4515   return CC == ISD::SETLT || CC == ISD::SETLE;
4516 }
4517
4518 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4519 // All of these conditions (and their <= and >= counterparts) will do:
4520 //          x < k ? k : x
4521 //          x > k ? x : k
4522 //          k < x ? x : k
4523 //          k > x ? k : x
4524 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4525                             const SDValue TrueVal, const SDValue FalseVal,
4526                             const ISD::CondCode CC, const SDValue K) {
4527   return (isGTorGE(CC) &&
4528           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4529          (isLTorLE(CC) &&
4530           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4531 }
4532
4533 // Similar to isLowerSaturate(), but checks for upper-saturating conditions.
4534 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
4535                             const SDValue TrueVal, const SDValue FalseVal,
4536                             const ISD::CondCode CC, const SDValue K) {
4537   return (isGTorGE(CC) &&
4538           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
4539          (isLTorLE(CC) &&
4540           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
4541 }
4542
4543 // Check if two chained conditionals could be converted into SSAT or USAT.
4544 //
4545 // SSAT can replace a set of two conditional selectors that bound a number to an
4546 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
4547 //
4548 //     x < -k ? -k : (x > k ? k : x)
4549 //     x < -k ? -k : (x < k ? x : k)
4550 //     x > -k ? (x > k ? k : x) : -k
4551 //     x < k ? (x < -k ? -k : x) : k
4552 //     etc.
4553 //
4554 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is
4555 // a power of 2.
4556 //
4557 // It returns true if the conversion can be done, false otherwise.
4558 // Additionally, the variable is returned in parameter V, the constant in K and
4559 // usat is set to true if the conditional represents an unsigned saturation
4560 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
4561                                     uint64_t &K, bool &usat) {
4562   SDValue LHS1 = Op.getOperand(0);
4563   SDValue RHS1 = Op.getOperand(1);
4564   SDValue TrueVal1 = Op.getOperand(2);
4565   SDValue FalseVal1 = Op.getOperand(3);
4566   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4567
4568   const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
4569   if (Op2.getOpcode() != ISD::SELECT_CC)
4570     return false;
4571
4572   SDValue LHS2 = Op2.getOperand(0);
4573   SDValue RHS2 = Op2.getOperand(1);
4574   SDValue TrueVal2 = Op2.getOperand(2);
4575   SDValue FalseVal2 = Op2.getOperand(3);
4576   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
4577
4578   // Find out which are the constants and which are the variables
4579   // in each conditional
4580   SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
4581                                                         ? &RHS1
4582                                                         : nullptr;
4583   SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
4584                                                         ? &RHS2
4585                                                         : nullptr;
4586   SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
4587   SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
4588   SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
4589   SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
4590
4591   // We must detect cases where the original operations worked with 16- or
4592   // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
4593   // must work with sign-extended values but the select operations return
4594   // the original non-extended value.
4595   SDValue V2TmpReg = V2Tmp;
4596   if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
4597     V2TmpReg = V2Tmp->getOperand(0);
4598
4599   // Check that the registers and the constants have the correct values
4600   // in both conditionals
4601   if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
4602       V2TmpReg != V2)
4603     return false;
4604
4605   // Figure out which conditional is saturating the lower/upper bound.
4606   const SDValue *LowerCheckOp =
4607       isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4608           ? &Op
4609           : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
4610                 ? &Op2
4611                 : nullptr;
4612   const SDValue *UpperCheckOp =
4613       isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4614           ? &Op
4615           : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
4616                 ? &Op2
4617                 : nullptr;
4618
4619   if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
4620     return false;
4621
4622   // Check that the constant in the lower-bound check is
4623   // the opposite of the constant in the upper-bound check
4624   // in 1's complement.
4625   int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
4626   int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
4627   int64_t PosVal = std::max(Val1, Val2);
4628   int64_t NegVal = std::min(Val1, Val2);
4629
4630   if (((Val1 > Val2 && UpperCheckOp == &Op) ||
4631        (Val1 < Val2 && UpperCheckOp == &Op2)) &&
4632       isPowerOf2_64(PosVal + 1)) {
4633
4634     // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
4635     if (Val1 == ~Val2)
4636       usat = false;
4637     else if (NegVal == 0)
4638       usat = true;
4639     else
4640       return false;
4641
4642     V = V2;
4643     K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
4644
4645     return true;
4646   }
4647
4648   return false;
4649 }
4650
4651 // Check if a condition of the type x < k ? k : x can be converted into a
4652 // bit operation instead of conditional moves.
4653 // Currently this is allowed given:
4654 // - The conditions and values match up
4655 // - k is 0 or -1 (all ones)
4656 // This function will not check the last condition, thats up to the caller
4657 // It returns true if the transformation can be made, and in such case
4658 // returns x in V, and k in SatK.
4659 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
4660                                          SDValue &SatK)
4661 {
4662   SDValue LHS = Op.getOperand(0);
4663   SDValue RHS = Op.getOperand(1);
4664   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4665   SDValue TrueVal = Op.getOperand(2);
4666   SDValue FalseVal = Op.getOperand(3);
4667
4668   SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
4669                                                ? &RHS
4670                                                : nullptr;
4671
4672   // No constant operation in comparison, early out
4673   if (!K)
4674     return false;
4675
4676   SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
4677   V = (KTmp == TrueVal) ? FalseVal : TrueVal;
4678   SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
4679
4680   // If the constant on left and right side, or variable on left and right,
4681   // does not match, early out
4682   if (*K != KTmp || V != VTmp)
4683     return false;
4684
4685   if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
4686     SatK = *K;
4687     return true;
4688   }
4689
4690   return false;
4691 }
4692
4693 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
4694   if (VT == MVT::f32)
4695     return !Subtarget->hasVFP2Base();
4696   if (VT == MVT::f64)
4697     return !Subtarget->hasFP64();
4698   if (VT == MVT::f16)
4699     return !Subtarget->hasFullFP16();
4700   return false;
4701 }
4702
4703 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
4704   EVT VT = Op.getValueType();
4705   SDLoc dl(Op);
4706
4707   // Try to convert two saturating conditional selects into a single SSAT
4708   SDValue SatValue;
4709   uint64_t SatConstant;
4710   bool SatUSat;
4711   if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
4712       isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
4713     if (SatUSat)
4714       return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
4715                          DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
4716     else
4717       return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
4718                          DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
4719   }
4720
4721   // Try to convert expressions of the form x < k ? k : x (and similar forms)
4722   // into more efficient bit operations, which is possible when k is 0 or -1
4723   // On ARM and Thumb-2 which have flexible operand 2 this will result in
4724   // single instructions. On Thumb the shift and the bit operation will be two
4725   // instructions.
4726   // Only allow this transformation on full-width (32-bit) operations
4727   SDValue LowerSatConstant;
4728   if (VT == MVT::i32 &&
4729       isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
4730     SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
4731                                  DAG.getConstant(31, dl, VT));
4732     if (isNullConstant(LowerSatConstant)) {
4733       SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
4734                                       DAG.getAllOnesConstant(dl, VT));
4735       return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
4736     } else if (isAllOnesConstant(LowerSatConstant))
4737       return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
4738   }
4739
4740   SDValue LHS = Op.getOperand(0);
4741   SDValue RHS = Op.getOperand(1);
4742   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4743   SDValue TrueVal = Op.getOperand(2);
4744   SDValue FalseVal = Op.getOperand(3);
4745
4746   if (isUnsupportedFloatingType(LHS.getValueType())) {
4747     DAG.getTargetLoweringInfo().softenSetCCOperands(
4748         DAG, LHS.getValueType(), LHS, RHS, CC, dl);
4749
4750     // If softenSetCCOperands only returned one value, we should compare it to
4751     // zero.
4752     if (!RHS.getNode()) {
4753       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4754       CC = ISD::SETNE;
4755     }
4756   }
4757
4758   if (LHS.getValueType() == MVT::i32) {
4759     // Try to generate VSEL on ARMv8.
4760     // The VSEL instruction can't use all the usual ARM condition
4761     // codes: it only has two bits to select the condition code, so it's
4762     // constrained to use only GE, GT, VS and EQ.
4763     //
4764     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
4765     // swap the operands of the previous compare instruction (effectively
4766     // inverting the compare condition, swapping 'less' and 'greater') and
4767     // sometimes need to swap the operands to the VSEL (which inverts the
4768     // condition in the sense of firing whenever the previous condition didn't)
4769     if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
4770                                         TrueVal.getValueType() == MVT::f32 ||
4771                                         TrueVal.getValueType() == MVT::f64)) {
4772       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4773       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
4774           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
4775         CC = ISD::getSetCCInverse(CC, true);
4776         std::swap(TrueVal, FalseVal);
4777       }
4778     }
4779
4780     SDValue ARMcc;
4781     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4782     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4783     // Choose GE over PL, which vsel does now support
4784     if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
4785       ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
4786     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4787   }
4788
4789   ARMCC::CondCodes CondCode, CondCode2;
4790   bool InvalidOnQNaN;
4791   FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
4792
4793   // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
4794   // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
4795   // must use VSEL (limited condition codes), due to not having conditional f16
4796   // moves.
4797   if (Subtarget->hasFPARMv8Base() &&
4798       !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
4799       (TrueVal.getValueType() == MVT::f16 ||
4800        TrueVal.getValueType() == MVT::f32 ||
4801        TrueVal.getValueType() == MVT::f64)) {
4802     bool swpCmpOps = false;
4803     bool swpVselOps = false;
4804     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
4805
4806     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
4807         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
4808       if (swpCmpOps)
4809         std::swap(LHS, RHS);
4810       if (swpVselOps)
4811         std::swap(TrueVal, FalseVal);
4812     }
4813   }
4814
4815   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4816   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
4817   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4818   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
4819   if (CondCode2 != ARMCC::AL) {
4820     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
4821     // FIXME: Needs another CMP because flag can have but one use.
4822     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
4823     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
4824   }
4825   return Result;
4826 }
4827
4828 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
4829 /// to morph to an integer compare sequence.
4830 static bool canChangeToInt(SDValue Op, bool &SeenZero,
4831                            const ARMSubtarget *Subtarget) {
4832   SDNode *N = Op.getNode();
4833   if (!N->hasOneUse())
4834     // Otherwise it requires moving the value from fp to integer registers.
4835     return false;
4836   if (!N->getNumValues())
4837     return false;
4838   EVT VT = Op.getValueType();
4839   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
4840     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
4841     // vmrs are very slow, e.g. cortex-a8.
4842     return false;
4843
4844   if (isFloatingPointZero(Op)) {
4845     SeenZero = true;
4846     return true;
4847   }
4848   return ISD::isNormalLoad(N);
4849 }
4850
4851 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
4852   if (isFloatingPointZero(Op))
4853     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
4854
4855   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
4856     return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
4857                        Ld->getPointerInfo(), Ld->getAlignment(),
4858                        Ld->getMemOperand()->getFlags());
4859
4860   llvm_unreachable("Unknown VFP cmp argument!");
4861 }
4862
4863 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
4864                            SDValue &RetVal1, SDValue &RetVal2) {
4865   SDLoc dl(Op);
4866
4867   if (isFloatingPointZero(Op)) {
4868     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
4869     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
4870     return;
4871   }
4872
4873   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
4874     SDValue Ptr = Ld->getBasePtr();
4875     RetVal1 =
4876         DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
4877                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
4878
4879     EVT PtrType = Ptr.getValueType();
4880     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
4881     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
4882                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
4883     RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
4884                           Ld->getPointerInfo().getWithOffset(4), NewAlign,
4885                           Ld->getMemOperand()->getFlags());
4886     return;
4887   }
4888
4889   llvm_unreachable("Unknown VFP cmp argument!");
4890 }
4891
4892 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
4893 /// f32 and even f64 comparisons to integer ones.
4894 SDValue
4895 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
4896   SDValue Chain = Op.getOperand(0);
4897   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4898   SDValue LHS = Op.getOperand(2);
4899   SDValue RHS = Op.getOperand(3);
4900   SDValue Dest = Op.getOperand(4);
4901   SDLoc dl(Op);
4902
4903   bool LHSSeenZero = false;
4904   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
4905   bool RHSSeenZero = false;
4906   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
4907   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
4908     // If unsafe fp math optimization is enabled and there are no other uses of
4909     // the CMP operands, and the condition code is EQ or NE, we can optimize it
4910     // to an integer comparison.
4911     if (CC == ISD::SETOEQ)
4912       CC = ISD::SETEQ;
4913     else if (CC == ISD::SETUNE)
4914       CC = ISD::SETNE;
4915
4916     SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
4917     SDValue ARMcc;
4918     if (LHS.getValueType() == MVT::f32) {
4919       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4920                         bitcastf32Toi32(LHS, DAG), Mask);
4921       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
4922                         bitcastf32Toi32(RHS, DAG), Mask);
4923       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
4924       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4925       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
4926                          Chain, Dest, ARMcc, CCR, Cmp);
4927     }
4928
4929     SDValue LHS1, LHS2;
4930     SDValue RHS1, RHS2;
4931     expandf64Toi32(LHS, DAG, LHS1, LHS2);
4932     expandf64Toi32(RHS, DAG, RHS1, RHS2);
4933     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
4934     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
4935     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4936     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4937     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
4938     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
4939     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
4940   }
4941
4942   return SDValue();
4943 }
4944
4945 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
4946   SDValue Chain = Op.getOperand(0);
4947   SDValue Cond = Op.getOperand(1);
4948   SDValue Dest = Op.getOperand(2);
4949   SDLoc dl(Op);
4950
4951   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
4952   // instruction.
4953   unsigned Opc = Cond.getOpcode();
4954   bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
4955                       !Subtarget->isThumb1Only();
4956   if (Cond.getResNo() == 1 &&
4957       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4958        Opc == ISD::USUBO || OptimizeMul)) {
4959     // Only lower legal XALUO ops.
4960     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
4961       return SDValue();
4962
4963     // The actual operation with overflow check.
4964     SDValue Value, OverflowCmp;
4965     SDValue ARMcc;
4966     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4967
4968     // Reverse the condition code.
4969     ARMCC::CondCodes CondCode =
4970         (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
4971     CondCode = ARMCC::getOppositeCondition(CondCode);
4972     ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
4973     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4974
4975     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
4976                        OverflowCmp);
4977   }
4978
4979   return SDValue();
4980 }
4981
4982 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
4983   SDValue Chain = Op.getOperand(0);
4984   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
4985   SDValue LHS = Op.getOperand(2);
4986   SDValue RHS = Op.getOperand(3);
4987   SDValue Dest = Op.getOperand(4);
4988   SDLoc dl(Op);
4989
4990   if (isUnsupportedFloatingType(LHS.getValueType())) {
4991     DAG.getTargetLoweringInfo().softenSetCCOperands(
4992         DAG, LHS.getValueType(), LHS, RHS, CC, dl);
4993
4994     // If softenSetCCOperands only returned one value, we should compare it to
4995     // zero.
4996     if (!RHS.getNode()) {
4997       RHS = DAG.getConstant(0, dl, LHS.getValueType());
4998       CC = ISD::SETNE;
4999     }
5000   }
5001
5002   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5003   // instruction.
5004   unsigned Opc = LHS.getOpcode();
5005   bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5006                       !Subtarget->isThumb1Only();
5007   if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5008       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5009        Opc == ISD::USUBO || OptimizeMul) &&
5010       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5011     // Only lower legal XALUO ops.
5012     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5013       return SDValue();
5014
5015     // The actual operation with overflow check.
5016     SDValue Value, OverflowCmp;
5017     SDValue ARMcc;
5018     std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5019
5020     if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5021       // Reverse the condition code.
5022       ARMCC::CondCodes CondCode =
5023           (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5024       CondCode = ARMCC::getOppositeCondition(CondCode);
5025       ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5026     }
5027     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5028
5029     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5030                        OverflowCmp);
5031   }
5032
5033   if (LHS.getValueType() == MVT::i32) {
5034     SDValue ARMcc;
5035     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5036     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5037     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5038                        Chain, Dest, ARMcc, CCR, Cmp);
5039   }
5040
5041   if (getTargetMachine().Options.UnsafeFPMath &&
5042       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5043        CC == ISD::SETNE || CC == ISD::SETUNE)) {
5044     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5045       return Result;
5046   }
5047
5048   ARMCC::CondCodes CondCode, CondCode2;
5049   bool InvalidOnQNaN;
5050   FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
5051
5052   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5053   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
5054   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5055   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5056   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5057   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5058   if (CondCode2 != ARMCC::AL) {
5059     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5060     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
5061     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5062   }
5063   return Res;
5064 }
5065
5066 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5067   SDValue Chain = Op.getOperand(0);
5068   SDValue Table = Op.getOperand(1);
5069   SDValue Index = Op.getOperand(2);
5070   SDLoc dl(Op);
5071
5072   EVT PTy = getPointerTy(DAG.getDataLayout());
5073   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5074   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5075   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5076   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5077   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5078   if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5079     // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5080     // which does another jump to the destination. This also makes it easier
5081     // to translate it to TBB / TBH later (Thumb2 only).
5082     // FIXME: This might not work if the function is extremely large.
5083     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5084                        Addr, Op.getOperand(2), JTI);
5085   }
5086   if (isPositionIndependent() || Subtarget->isROPI()) {
5087     Addr =
5088         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5089                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5090     Chain = Addr.getValue(1);
5091     Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5092     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5093   } else {
5094     Addr =
5095         DAG.getLoad(PTy, dl, Chain, Addr,
5096                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5097     Chain = Addr.getValue(1);
5098     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5099   }
5100 }
5101
5102 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5103   EVT VT = Op.getValueType();
5104   SDLoc dl(Op);
5105
5106   if (Op.getValueType().getVectorElementType() == MVT::i32) {
5107     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5108       return Op;
5109     return DAG.UnrollVectorOp(Op.getNode());
5110   }
5111
5112   const bool HasFullFP16 =
5113     static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5114
5115   EVT NewTy;
5116   const EVT OpTy = Op.getOperand(0).getValueType();
5117   if (OpTy == MVT::v4f32)
5118     NewTy = MVT::v4i32;
5119   else if (OpTy == MVT::v4f16 && HasFullFP16)
5120     NewTy = MVT::v4i16;
5121   else if (OpTy == MVT::v8f16 && HasFullFP16)
5122     NewTy = MVT::v8i16;
5123   else
5124     llvm_unreachable("Invalid type for custom lowering!");
5125
5126   if (VT != MVT::v4i16 && VT != MVT::v8i16)
5127     return DAG.UnrollVectorOp(Op.getNode());
5128
5129   Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5130   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5131 }
5132
5133 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5134   EVT VT = Op.getValueType();
5135   if (VT.isVector())
5136     return LowerVectorFP_TO_INT(Op, DAG);
5137   if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
5138     RTLIB::Libcall LC;
5139     if (Op.getOpcode() == ISD::FP_TO_SINT)
5140       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
5141                               Op.getValueType());
5142     else
5143       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
5144                               Op.getValueType());
5145     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5146                        /*isSigned*/ false, SDLoc(Op)).first;
5147   }
5148
5149   return Op;
5150 }
5151
5152 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5153   EVT VT = Op.getValueType();
5154   SDLoc dl(Op);
5155
5156   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5157     if (VT.getVectorElementType() == MVT::f32)
5158       return Op;
5159     return DAG.UnrollVectorOp(Op.getNode());
5160   }
5161
5162   assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5163           Op.getOperand(0).getValueType() == MVT::v8i16) &&
5164          "Invalid type for custom lowering!");
5165
5166   const bool HasFullFP16 =
5167     static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5168
5169   EVT DestVecType;
5170   if (VT == MVT::v4f32)
5171     DestVecType = MVT::v4i32;
5172   else if (VT == MVT::v4f16 && HasFullFP16)
5173     DestVecType = MVT::v4i16;
5174   else if (VT == MVT::v8f16 && HasFullFP16)
5175     DestVecType = MVT::v8i16;
5176   else
5177     return DAG.UnrollVectorOp(Op.getNode());
5178
5179   unsigned CastOpc;
5180   unsigned Opc;
5181   switch (Op.getOpcode()) {
5182   default: llvm_unreachable("Invalid opcode!");
5183   case ISD::SINT_TO_FP:
5184     CastOpc = ISD::SIGN_EXTEND;
5185     Opc = ISD::SINT_TO_FP;
5186     break;
5187   case ISD::UINT_TO_FP:
5188     CastOpc = ISD::ZERO_EXTEND;
5189     Opc = ISD::UINT_TO_FP;
5190     break;
5191   }
5192
5193   Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5194   return DAG.getNode(Opc, dl, VT, Op);
5195 }
5196
5197 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5198   EVT VT = Op.getValueType();
5199   if (VT.isVector())
5200     return LowerVectorINT_TO_FP(Op, DAG);
5201   if (isUnsupportedFloatingType(VT)) {
5202     RTLIB::Libcall LC;
5203     if (Op.getOpcode() == ISD::SINT_TO_FP)
5204       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5205                               Op.getValueType());
5206     else
5207       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5208                               Op.getValueType());
5209     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5210                        /*isSigned*/ false, SDLoc(Op)).first;
5211   }
5212
5213   return Op;
5214 }
5215
5216 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5217   // Implement fcopysign with a fabs and a conditional fneg.
5218   SDValue Tmp0 = Op.getOperand(0);
5219   SDValue Tmp1 = Op.getOperand(1);
5220   SDLoc dl(Op);
5221   EVT VT = Op.getValueType();
5222   EVT SrcVT = Tmp1.getValueType();
5223   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5224     Tmp0.getOpcode() == ARMISD::VMOVDRR;
5225   bool UseNEON = !InGPR && Subtarget->hasNEON();
5226
5227   if (UseNEON) {
5228     // Use VBSL to copy the sign bit.
5229     unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5230     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5231                                DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5232     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5233     if (VT == MVT::f64)
5234       Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5235                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5236                          DAG.getConstant(32, dl, MVT::i32));
5237     else /*if (VT == MVT::f32)*/
5238       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5239     if (SrcVT == MVT::f32) {
5240       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5241       if (VT == MVT::f64)
5242         Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5243                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5244                            DAG.getConstant(32, dl, MVT::i32));
5245     } else if (VT == MVT::f32)
5246       Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5247                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5248                          DAG.getConstant(32, dl, MVT::i32));
5249     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5250     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5251
5252     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5253                                             dl, MVT::i32);
5254     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5255     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5256                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5257
5258     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5259                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5260                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5261     if (VT == MVT::f32) {
5262       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5263       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5264                         DAG.getConstant(0, dl, MVT::i32));
5265     } else {
5266       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5267     }
5268
5269     return Res;
5270   }
5271
5272   // Bitcast operand 1 to i32.
5273   if (SrcVT == MVT::f64)
5274     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5275                        Tmp1).getValue(1);
5276   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5277
5278   // Or in the signbit with integer operations.
5279   SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5280   SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5281   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5282   if (VT == MVT::f32) {
5283     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5284                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5285     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5286                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5287   }
5288
5289   // f64: Or the high part with signbit and then combine two parts.
5290   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5291                      Tmp0);
5292   SDValue Lo = Tmp0.getValue(0);
5293   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5294   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5295   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5296 }
5297
5298 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5299   MachineFunction &MF = DAG.getMachineFunction();
5300   MachineFrameInfo &MFI = MF.getFrameInfo();
5301   MFI.setReturnAddressIsTaken(true);
5302
5303   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
5304     return SDValue();
5305
5306   EVT VT = Op.getValueType();
5307   SDLoc dl(Op);
5308   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5309   if (Depth) {
5310     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5311     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5312     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5313                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5314                        MachinePointerInfo());
5315   }
5316
5317   // Return LR, which contains the return address. Mark it an implicit live-in.
5318   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5319   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5320 }
5321
5322 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5323   const ARMBaseRegisterInfo &ARI =
5324     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5325   MachineFunction &MF = DAG.getMachineFunction();
5326   MachineFrameInfo &MFI = MF.getFrameInfo();
5327   MFI.setFrameAddressIsTaken(true);
5328
5329   EVT VT = Op.getValueType();
5330   SDLoc dl(Op);  // FIXME probably not meaningful
5331   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5332   unsigned FrameReg = ARI.getFrameRegister(MF);
5333   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5334   while (Depth--)
5335     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5336                             MachinePointerInfo());
5337   return FrameAddr;
5338 }
5339
5340 // FIXME? Maybe this could be a TableGen attribute on some registers and
5341 // this table could be generated automatically from RegInfo.
5342 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
5343                                               SelectionDAG &DAG) const {
5344   unsigned Reg = StringSwitch<unsigned>(RegName)
5345                        .Case("sp", ARM::SP)
5346                        .Default(0);
5347   if (Reg)
5348     return Reg;
5349   report_fatal_error(Twine("Invalid register name \""
5350                               + StringRef(RegName)  + "\"."));
5351 }
5352
5353 // Result is 64 bit value so split into two 32 bit values and return as a
5354 // pair of values.
5355 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
5356                                 SelectionDAG &DAG) {
5357   SDLoc DL(N);
5358
5359   // This function is only supposed to be called for i64 type destination.
5360   assert(N->getValueType(0) == MVT::i64
5361           && "ExpandREAD_REGISTER called for non-i64 type result.");
5362
5363   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5364                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5365                              N->getOperand(0),
5366                              N->getOperand(1));
5367
5368   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5369                     Read.getValue(1)));
5370   Results.push_back(Read.getOperand(0));
5371 }
5372
5373 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5374 /// When \p DstVT, the destination type of \p BC, is on the vector
5375 /// register bank and the source of bitcast, \p Op, operates on the same bank,
5376 /// it might be possible to combine them, such that everything stays on the
5377 /// vector register bank.
5378 /// \p return The node that would replace \p BT, if the combine
5379 /// is possible.
5380 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
5381                                                 SelectionDAG &DAG) {
5382   SDValue Op = BC->getOperand(0);
5383   EVT DstVT = BC->getValueType(0);
5384
5385   // The only vector instruction that can produce a scalar (remember,
5386   // since the bitcast was about to be turned into VMOVDRR, the source
5387   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5388   // Moreover, we can do this combine only if there is one use.
5389   // Finally, if the destination type is not a vector, there is not
5390   // much point on forcing everything on the vector bank.
5391   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5392       !Op.hasOneUse())
5393     return SDValue();
5394
5395   // If the index is not constant, we will introduce an additional
5396   // multiply that will stick.
5397   // Give up in that case.
5398   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5399   if (!Index)
5400     return SDValue();
5401   unsigned DstNumElt = DstVT.getVectorNumElements();
5402
5403   // Compute the new index.
5404   const APInt &APIntIndex = Index->getAPIntValue();
5405   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5406   NewIndex *= APIntIndex;
5407   // Check if the new constant index fits into i32.
5408   if (NewIndex.getBitWidth() > 32)
5409     return SDValue();
5410
5411   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5412   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
5413   SDLoc dl(Op);
5414   SDValue ExtractSrc = Op.getOperand(0);
5415   EVT VecVT = EVT::getVectorVT(
5416       *DAG.getContext(), DstVT.getScalarType(),
5417       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
5418   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
5419   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
5420                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
5421 }
5422
5423 /// ExpandBITCAST - If the target supports VFP, this function is called to
5424 /// expand a bit convert where either the source or destination type is i64 to
5425 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
5426 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
5427 /// vectors), since the legalizer won't know what to do with that.
5428 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
5429                              const ARMSubtarget *Subtarget) {
5430   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5431   SDLoc dl(N);
5432   SDValue Op = N->getOperand(0);
5433
5434   // This function is only supposed to be called for i64 types, either as the
5435   // source or destination of the bit convert.
5436   EVT SrcVT = Op.getValueType();
5437   EVT DstVT = N->getValueType(0);
5438   const bool HasFullFP16 = Subtarget->hasFullFP16();
5439
5440   if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
5441      // FullFP16: half values are passed in S-registers, and we don't
5442      // need any of the bitcast and moves:
5443      //
5444      // t2: f32,ch = CopyFromReg t0, Register:f32 %0
5445      //   t5: i32 = bitcast t2
5446      // t18: f16 = ARMISD::VMOVhr t5
5447      if (Op.getOpcode() != ISD::CopyFromReg ||
5448          Op.getValueType() != MVT::f32)
5449        return SDValue();
5450
5451      auto Move = N->use_begin();
5452      if (Move->getOpcode() != ARMISD::VMOVhr)
5453        return SDValue();
5454
5455      SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
5456      SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
5457      DAG.ReplaceAllUsesWith(*Move, &Copy);
5458      return Copy;
5459   }
5460
5461   if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
5462     if (!HasFullFP16)
5463       return SDValue();
5464     // SoftFP: read half-precision arguments:
5465     //
5466     // t2: i32,ch = ...
5467     //        t7: i16 = truncate t2 <~~~~ Op
5468     //      t8: f16 = bitcast t7    <~~~~ N
5469     //
5470     if (Op.getOperand(0).getValueType() == MVT::i32)
5471       return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
5472                          MVT::f16, Op.getOperand(0));
5473
5474     return SDValue();
5475   }
5476
5477   // Half-precision return values
5478   if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
5479     if (!HasFullFP16)
5480       return SDValue();
5481     //
5482     //          t11: f16 = fadd t8, t10
5483     //        t12: i16 = bitcast t11       <~~~ SDNode N
5484     //      t13: i32 = zero_extend t12
5485     //    t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
5486     //  t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
5487     //
5488     // transform this into:
5489     //
5490     //    t20: i32 = ARMISD::VMOVrh t11
5491     //  t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
5492     //
5493     auto ZeroExtend = N->use_begin();
5494     if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
5495         ZeroExtend->getValueType(0) != MVT::i32)
5496       return SDValue();
5497
5498     auto Copy = ZeroExtend->use_begin();
5499     if (Copy->getOpcode() == ISD::CopyToReg &&
5500         Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
5501       SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
5502       DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
5503       return Cvt;
5504     }
5505     return SDValue();
5506   }
5507
5508   if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
5509     return SDValue();
5510
5511   // Turn i64->f64 into VMOVDRR.
5512   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
5513     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
5514     // if we can combine the bitcast with its source.
5515     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
5516       return Val;
5517
5518     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
5519                              DAG.getConstant(0, dl, MVT::i32));
5520     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
5521                              DAG.getConstant(1, dl, MVT::i32));
5522     return DAG.getNode(ISD::BITCAST, dl, DstVT,
5523                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
5524   }
5525
5526   // Turn f64->i64 into VMOVRRD.
5527   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
5528     SDValue Cvt;
5529     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
5530         SrcVT.getVectorNumElements() > 1)
5531       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5532                         DAG.getVTList(MVT::i32, MVT::i32),
5533                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
5534     else
5535       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5536                         DAG.getVTList(MVT::i32, MVT::i32), Op);
5537     // Merge the pieces into a single i64 value.
5538     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
5539   }
5540
5541   return SDValue();
5542 }
5543
5544 /// getZeroVector - Returns a vector of specified type with all zero elements.
5545 /// Zero vectors are used to represent vector negation and in those cases
5546 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
5547 /// not support i64 elements, so sometimes the zero vectors will need to be
5548 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
5549 /// zero vector.
5550 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5551   assert(VT.isVector() && "Expected a vector type");
5552   // The canonical modified immediate encoding of a zero vector is....0!
5553   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
5554   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
5555   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
5556   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5557 }
5558
5559 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
5560 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
5561 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
5562                                                 SelectionDAG &DAG) const {
5563   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5564   EVT VT = Op.getValueType();
5565   unsigned VTBits = VT.getSizeInBits();
5566   SDLoc dl(Op);
5567   SDValue ShOpLo = Op.getOperand(0);
5568   SDValue ShOpHi = Op.getOperand(1);
5569   SDValue ShAmt  = Op.getOperand(2);
5570   SDValue ARMcc;
5571   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5572   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
5573
5574   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
5575
5576   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
5577                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
5578   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
5579   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
5580                                    DAG.getConstant(VTBits, dl, MVT::i32));
5581   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
5582   SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
5583   SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
5584   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5585                             ISD::SETGE, ARMcc, DAG, dl);
5586   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
5587                            ARMcc, CCR, CmpLo);
5588
5589   SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
5590   SDValue HiBigShift = Opc == ISD::SRA
5591                            ? DAG.getNode(Opc, dl, VT, ShOpHi,
5592                                          DAG.getConstant(VTBits - 1, dl, VT))
5593                            : DAG.getConstant(0, dl, VT);
5594   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5595                             ISD::SETGE, ARMcc, DAG, dl);
5596   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
5597                            ARMcc, CCR, CmpHi);
5598
5599   SDValue Ops[2] = { Lo, Hi };
5600   return DAG.getMergeValues(Ops, dl);
5601 }
5602
5603 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
5604 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
5605 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
5606                                                SelectionDAG &DAG) const {
5607   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5608   EVT VT = Op.getValueType();
5609   unsigned VTBits = VT.getSizeInBits();
5610   SDLoc dl(Op);
5611   SDValue ShOpLo = Op.getOperand(0);
5612   SDValue ShOpHi = Op.getOperand(1);
5613   SDValue ShAmt  = Op.getOperand(2);
5614   SDValue ARMcc;
5615   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5616
5617   assert(Op.getOpcode() == ISD::SHL_PARTS);
5618   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
5619                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
5620   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
5621   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
5622   SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
5623
5624   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
5625                                    DAG.getConstant(VTBits, dl, MVT::i32));
5626   SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
5627   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5628                             ISD::SETGE, ARMcc, DAG, dl);
5629   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
5630                            ARMcc, CCR, CmpHi);
5631
5632   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5633                           ISD::SETGE, ARMcc, DAG, dl);
5634   SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5635   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
5636                            DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
5637
5638   SDValue Ops[2] = { Lo, Hi };
5639   return DAG.getMergeValues(Ops, dl);
5640 }
5641
5642 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
5643                                             SelectionDAG &DAG) const {
5644   // The rounding mode is in bits 23:22 of the FPSCR.
5645   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5646   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
5647   // so that the shift + and get folded into a bitfield extract.
5648   SDLoc dl(Op);
5649   SDValue Ops[] = { DAG.getEntryNode(),
5650                     DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };
5651
5652   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
5653   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
5654                                   DAG.getConstant(1U << 22, dl, MVT::i32));
5655   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5656                               DAG.getConstant(22, dl, MVT::i32));
5657   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5658                      DAG.getConstant(3, dl, MVT::i32));
5659 }
5660
5661 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
5662                          const ARMSubtarget *ST) {
5663   SDLoc dl(N);
5664   EVT VT = N->getValueType(0);
5665   if (VT.isVector()) {
5666     assert(ST->hasNEON());
5667
5668     // Compute the least significant set bit: LSB = X & -X
5669     SDValue X = N->getOperand(0);
5670     SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
5671     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
5672
5673     EVT ElemTy = VT.getVectorElementType();
5674
5675     if (ElemTy == MVT::i8) {
5676       // Compute with: cttz(x) = ctpop(lsb - 1)
5677       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5678                                 DAG.getTargetConstant(1, dl, ElemTy));
5679       SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
5680       return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
5681     }
5682
5683     if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
5684         (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
5685       // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
5686       unsigned NumBits = ElemTy.getSizeInBits();
5687       SDValue WidthMinus1 =
5688           DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5689                       DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
5690       SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
5691       return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
5692     }
5693
5694     // Compute with: cttz(x) = ctpop(lsb - 1)
5695
5696     // Compute LSB - 1.
5697     SDValue Bits;
5698     if (ElemTy == MVT::i64) {
5699       // Load constant 0xffff'ffff'ffff'ffff to register.
5700       SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5701                                DAG.getTargetConstant(0x1eff, dl, MVT::i32));
5702       Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
5703     } else {
5704       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5705                                 DAG.getTargetConstant(1, dl, ElemTy));
5706       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
5707     }
5708     return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
5709   }
5710
5711   if (!ST->hasV6T2Ops())
5712     return SDValue();
5713
5714   SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
5715   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
5716 }
5717
5718 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
5719                           const ARMSubtarget *ST) {
5720   EVT VT = N->getValueType(0);
5721   SDLoc DL(N);
5722
5723   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
5724   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
5725           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
5726          "Unexpected type for custom ctpop lowering");
5727
5728   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5729   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5730   SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
5731   Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
5732
5733   // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
5734   unsigned EltSize = 8;
5735   unsigned NumElts = VT.is64BitVector() ? 8 : 16;
5736   while (EltSize != VT.getScalarSizeInBits()) {
5737     SmallVector<SDValue, 8> Ops;
5738     Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
5739                                   TLI.getPointerTy(DAG.getDataLayout())));
5740     Ops.push_back(Res);
5741
5742     EltSize *= 2;
5743     NumElts /= 2;
5744     MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
5745     Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
5746   }
5747
5748   return Res;
5749 }
5750
5751 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
5752 /// operand of a vector shift operation, where all the elements of the
5753 /// build_vector must have the same constant integer value.
5754 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
5755   // Ignore bit_converts.
5756   while (Op.getOpcode() == ISD::BITCAST)
5757     Op = Op.getOperand(0);
5758   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
5759   APInt SplatBits, SplatUndef;
5760   unsigned SplatBitSize;
5761   bool HasAnyUndefs;
5762   if (!BVN ||
5763       !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
5764                             ElementBits) ||
5765       SplatBitSize > ElementBits)
5766     return false;
5767   Cnt = SplatBits.getSExtValue();
5768   return true;
5769 }
5770
5771 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
5772 /// operand of a vector shift left operation.  That value must be in the range:
5773 ///   0 <= Value < ElementBits for a left shift; or
5774 ///   0 <= Value <= ElementBits for a long left shift.
5775 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
5776   assert(VT.isVector() && "vector shift count is not a vector type");
5777   int64_t ElementBits = VT.getScalarSizeInBits();
5778   if (!getVShiftImm(Op, ElementBits, Cnt))
5779     return false;
5780   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
5781 }
5782
5783 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
5784 /// operand of a vector shift right operation.  For a shift opcode, the value
5785 /// is positive, but for an intrinsic the value count must be negative. The
5786 /// absolute value must be in the range:
5787 ///   1 <= |Value| <= ElementBits for a right shift; or
5788 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
5789 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
5790                          int64_t &Cnt) {
5791   assert(VT.isVector() && "vector shift count is not a vector type");
5792   int64_t ElementBits = VT.getScalarSizeInBits();
5793   if (!getVShiftImm(Op, ElementBits, Cnt))
5794     return false;
5795   if (!isIntrinsic)
5796     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
5797   if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
5798     Cnt = -Cnt;
5799     return true;
5800   }
5801   return false;
5802 }
5803
5804 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
5805                           const ARMSubtarget *ST) {
5806   EVT VT = N->getValueType(0);
5807   SDLoc dl(N);
5808   int64_t Cnt;
5809
5810   if (!VT.isVector())
5811     return SDValue();
5812
5813   // We essentially have two forms here. Shift by an immediate and shift by a
5814   // vector register (there are also shift by a gpr, but that is just handled
5815   // with a tablegen pattern). We cannot easily match shift by an immediate in
5816   // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
5817   // For shifting by a vector, we don't have VSHR, only VSHL (which can be
5818   // signed or unsigned, and a negative shift indicates a shift right).
5819   if (N->getOpcode() == ISD::SHL) {
5820     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
5821       return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
5822                          DAG.getConstant(Cnt, dl, MVT::i32));
5823     return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
5824                        N->getOperand(1));
5825   }
5826
5827   assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
5828          "unexpected vector shift opcode");
5829
5830   if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
5831     unsigned VShiftOpc =
5832         (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
5833     return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
5834                        DAG.getConstant(Cnt, dl, MVT::i32));
5835   }
5836
5837   // Other right shifts we don't have operations for (we use a shift left by a
5838   // negative number).
5839   EVT ShiftVT = N->getOperand(1).getValueType();
5840   SDValue NegatedCount = DAG.getNode(
5841       ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
5842   unsigned VShiftOpc =
5843       (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
5844   return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
5845 }
5846
5847 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
5848                                 const ARMSubtarget *ST) {
5849   EVT VT = N->getValueType(0);
5850   SDLoc dl(N);
5851
5852   // We can get here for a node like i32 = ISD::SHL i32, i64
5853   if (VT != MVT::i64)
5854     return SDValue();
5855
5856   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
5857           N->getOpcode() == ISD::SHL) &&
5858          "Unknown shift to lower!");
5859
5860   unsigned ShOpc = N->getOpcode();
5861   if (ST->hasMVEIntegerOps()) {
5862     SDValue ShAmt = N->getOperand(1);
5863     unsigned ShPartsOpc = ARMISD::LSLL;
5864     ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
5865
5866     // If the shift amount is greater than 32 then do the default optimisation
5867     if (Con && Con->getZExtValue() > 32)
5868       return SDValue();
5869
5870     // Extract the lower 32 bits of the shift amount if it's an i64
5871     if (ShAmt->getValueType(0) == MVT::i64)
5872       ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
5873                           DAG.getConstant(0, dl, MVT::i32));
5874
5875     if (ShOpc == ISD::SRL) {
5876       if (!Con)
5877         // There is no t2LSRLr instruction so negate and perform an lsll if the
5878         // shift amount is in a register, emulating a right shift.
5879         ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
5880                             DAG.getConstant(0, dl, MVT::i32), ShAmt);
5881       else
5882         // Else generate an lsrl on the immediate shift amount
5883         ShPartsOpc = ARMISD::LSRL;
5884     } else if (ShOpc == ISD::SRA)
5885       ShPartsOpc = ARMISD::ASRL;
5886
5887     // Lower 32 bits of the destination/source
5888     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5889                              DAG.getConstant(0, dl, MVT::i32));
5890     // Upper 32 bits of the destination/source
5891     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5892                              DAG.getConstant(1, dl, MVT::i32));
5893
5894     // Generate the shift operation as computed above
5895     Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
5896                      ShAmt);
5897     // The upper 32 bits come from the second return value of lsll
5898     Hi = SDValue(Lo.getNode(), 1);
5899     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
5900   }
5901
5902   // We only lower SRA, SRL of 1 here, all others use generic lowering.
5903   if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
5904     return SDValue();
5905
5906   // If we are in thumb mode, we don't have RRX.
5907   if (ST->isThumb1Only())
5908     return SDValue();
5909
5910   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
5911   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5912                            DAG.getConstant(0, dl, MVT::i32));
5913   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
5914                            DAG.getConstant(1, dl, MVT::i32));
5915
5916   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
5917   // captures the result into a carry flag.
5918   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
5919   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
5920
5921   // The low part is an ARMISD::RRX operand, which shifts the carry in.
5922   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
5923
5924   // Merge the pieces into a single i64 value.
5925  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
5926 }
5927
5928 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
5929                            const ARMSubtarget *ST) {
5930   bool Invert = false;
5931   bool Swap = false;
5932   unsigned Opc = ARMCC::AL;
5933
5934   SDValue Op0 = Op.getOperand(0);
5935   SDValue Op1 = Op.getOperand(1);
5936   SDValue CC = Op.getOperand(2);
5937   EVT VT = Op.getValueType();
5938   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5939   SDLoc dl(Op);
5940
5941   EVT CmpVT;
5942   if (ST->hasNEON())
5943     CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
5944   else {
5945     assert(ST->hasMVEIntegerOps() &&
5946            "No hardware support for integer vector comparison!");
5947
5948     if (Op.getValueType().getVectorElementType() != MVT::i1)
5949       return SDValue();
5950
5951     // Make sure we expand floating point setcc to scalar if we do not have
5952     // mve.fp, so that we can handle them from there.
5953     if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
5954       return SDValue();
5955
5956     CmpVT = VT;
5957   }
5958
5959   if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
5960       (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
5961     // Special-case integer 64-bit equality comparisons. They aren't legal,
5962     // but they can be lowered with a few vector instructions.
5963     unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
5964     EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
5965     SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
5966     SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
5967     SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
5968                               DAG.getCondCode(ISD::SETEQ));
5969     SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
5970     SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
5971     Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
5972     if (SetCCOpcode == ISD::SETNE)
5973       Merged = DAG.getNOT(dl, Merged, CmpVT);
5974     Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
5975     return Merged;
5976   }
5977
5978   if (CmpVT.getVectorElementType() == MVT::i64)
5979     // 64-bit comparisons are not legal in general.
5980     return SDValue();
5981
5982   if (Op1.getValueType().isFloatingPoint()) {
5983     switch (SetCCOpcode) {
5984     default: llvm_unreachable("Illegal FP comparison");
5985     case ISD::SETUNE:
5986     case ISD::SETNE:
5987       if (ST->hasMVEFloatOps()) {
5988         Opc = ARMCC::NE; break;
5989       } else {
5990         Invert = true; LLVM_FALLTHROUGH;
5991       }
5992     case ISD::SETOEQ:
5993     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
5994     case ISD::SETOLT:
5995     case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
5996     case ISD::SETOGT:
5997     case ISD::SETGT:  Opc = ARMCC::GT; break;
5998     case ISD::SETOLE:
5999     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
6000     case ISD::SETOGE:
6001     case ISD::SETGE: Opc = ARMCC::GE; break;
6002     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
6003     case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6004     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
6005     case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6006     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
6007     case ISD::SETONE: {
6008       // Expand this to (OLT | OGT).
6009       SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6010                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6011       SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6012                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6013       SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6014       if (Invert)
6015         Result = DAG.getNOT(dl, Result, VT);
6016       return Result;
6017     }
6018     case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
6019     case ISD::SETO: {
6020       // Expand this to (OLT | OGE).
6021       SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6022                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6023       SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6024                                    DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6025       SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6026       if (Invert)
6027         Result = DAG.getNOT(dl, Result, VT);
6028       return Result;
6029     }
6030     }
6031   } else {
6032     // Integer comparisons.
6033     switch (SetCCOpcode) {
6034     default: llvm_unreachable("Illegal integer comparison");
6035     case ISD::SETNE:
6036       if (ST->hasMVEIntegerOps()) {
6037         Opc = ARMCC::NE; break;
6038       } else {
6039         Invert = true; LLVM_FALLTHROUGH;
6040       }
6041     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
6042     case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
6043     case ISD::SETGT:  Opc = ARMCC::GT; break;
6044     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
6045     case ISD::SETGE:  Opc = ARMCC::GE; break;
6046     case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
6047     case ISD::SETUGT: Opc = ARMCC::HI; break;
6048     case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
6049     case ISD::SETUGE: Opc = ARMCC::HS; break;
6050     }
6051
6052     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6053     if (ST->hasNEON() && Opc == ARMCC::EQ) {
6054       SDValue AndOp;
6055       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6056         AndOp = Op0;
6057       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6058         AndOp = Op1;
6059
6060       // Ignore bitconvert.
6061       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6062         AndOp = AndOp.getOperand(0);
6063
6064       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6065         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6066         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6067         SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6068         if (!Invert)
6069           Result = DAG.getNOT(dl, Result, VT);
6070         return Result;
6071       }
6072     }
6073   }
6074
6075   if (Swap)
6076     std::swap(Op0, Op1);
6077
6078   // If one of the operands is a constant vector zero, attempt to fold the
6079   // comparison to a specialized compare-against-zero form.
6080   SDValue SingleOp;
6081   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6082     SingleOp = Op0;
6083   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
6084     if (Opc == ARMCC::GE)
6085       Opc = ARMCC::LE;
6086     else if (Opc == ARMCC::GT)
6087       Opc = ARMCC::LT;
6088     SingleOp = Op1;
6089   }
6090
6091   SDValue Result;
6092   if (SingleOp.getNode()) {
6093     Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
6094                          DAG.getConstant(Opc, dl, MVT::i32));
6095   } else {
6096     Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6097                          DAG.getConstant(Opc, dl, MVT::i32));
6098   }
6099
6100   Result = DAG.getSExtOrTrunc(Result, dl, VT);
6101
6102   if (Invert)
6103     Result = DAG.getNOT(dl, Result, VT);
6104
6105   return Result;
6106 }
6107
6108 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6109   SDValue LHS = Op.getOperand(0);
6110   SDValue RHS = Op.getOperand(1);
6111   SDValue Carry = Op.getOperand(2);
6112   SDValue Cond = Op.getOperand(3);
6113   SDLoc DL(Op);
6114
6115   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6116
6117   // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
6118   // have to invert the carry first.
6119   Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6120                       DAG.getConstant(1, DL, MVT::i32), Carry);
6121   // This converts the boolean value carry into the carry flag.
6122   Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6123
6124   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6125   SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6126
6127   SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6128   SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6129   SDValue ARMcc = DAG.getConstant(
6130       IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6131   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6132   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
6133                                    Cmp.getValue(1), SDValue());
6134   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6135                      CCR, Chain.getValue(1));
6136 }
6137
6138 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6139 /// valid vector constant for a NEON or MVE instruction with a "modified
6140 /// immediate" operand (e.g., VMOV).  If so, return the encoded value.
6141 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6142                                  unsigned SplatBitSize, SelectionDAG &DAG,
6143                                  const SDLoc &dl, EVT &VT, bool is128Bits,
6144                                  VMOVModImmType type) {
6145   unsigned OpCmode, Imm;
6146
6147   // SplatBitSize is set to the smallest size that splats the vector, so a
6148   // zero vector will always have SplatBitSize == 8.  However, NEON modified
6149   // immediate instructions others than VMOV do not support the 8-bit encoding
6150   // of a zero vector, and the default encoding of zero is supposed to be the
6151   // 32-bit version.
6152   if (SplatBits == 0)
6153     SplatBitSize = 32;
6154
6155   switch (SplatBitSize) {
6156   case 8:
6157     if (type != VMOVModImm)
6158       return SDValue();
6159     // Any 1-byte value is OK.  Op=0, Cmode=1110.
6160     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6161     OpCmode = 0xe;
6162     Imm = SplatBits;
6163     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6164     break;
6165
6166   case 16:
6167     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6168     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6169     if ((SplatBits & ~0xff) == 0) {
6170       // Value = 0x00nn: Op=x, Cmode=100x.
6171       OpCmode = 0x8;
6172       Imm = SplatBits;
6173       break;
6174     }
6175     if ((SplatBits & ~0xff00) == 0) {
6176       // Value = 0xnn00: Op=x, Cmode=101x.
6177       OpCmode = 0xa;
6178       Imm = SplatBits >> 8;
6179       break;
6180     }
6181     return SDValue();
6182
6183   case 32:
6184     // NEON's 32-bit VMOV supports splat values where:
6185     // * only one byte is nonzero, or
6186     // * the least significant byte is 0xff and the second byte is nonzero, or
6187     // * the least significant 2 bytes are 0xff and the third is nonzero.
6188     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6189     if ((SplatBits & ~0xff) == 0) {
6190       // Value = 0x000000nn: Op=x, Cmode=000x.
6191       OpCmode = 0;
6192       Imm = SplatBits;
6193       break;
6194     }
6195     if ((SplatBits & ~0xff00) == 0) {
6196       // Value = 0x0000nn00: Op=x, Cmode=001x.
6197       OpCmode = 0x2;
6198       Imm = SplatBits >> 8;
6199       break;
6200     }
6201     if ((SplatBits & ~0xff0000) == 0) {
6202       // Value = 0x00nn0000: Op=x, Cmode=010x.
6203       OpCmode = 0x4;
6204       Imm = SplatBits >> 16;
6205       break;
6206     }
6207     if ((SplatBits & ~0xff000000) == 0) {
6208       // Value = 0xnn000000: Op=x, Cmode=011x.
6209       OpCmode = 0x6;
6210       Imm = SplatBits >> 24;
6211       break;
6212     }
6213
6214     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6215     if (type == OtherModImm) return SDValue();
6216
6217     if ((SplatBits & ~0xffff) == 0 &&
6218         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6219       // Value = 0x0000nnff: Op=x, Cmode=1100.
6220       OpCmode = 0xc;
6221       Imm = SplatBits >> 8;
6222       break;
6223     }
6224
6225     // cmode == 0b1101 is not supported for MVE VMVN
6226     if (type == MVEVMVNModImm)
6227       return SDValue();
6228
6229     if ((SplatBits & ~0xffffff) == 0 &&
6230         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6231       // Value = 0x00nnffff: Op=x, Cmode=1101.
6232       OpCmode = 0xd;
6233       Imm = SplatBits >> 16;
6234       break;
6235     }
6236
6237     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6238     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6239     // VMOV.I32.  A (very) minor optimization would be to replicate the value
6240     // and fall through here to test for a valid 64-bit splat.  But, then the
6241     // caller would also need to check and handle the change in size.
6242     return SDValue();
6243
6244   case 64: {
6245     if (type != VMOVModImm)
6246       return SDValue();
6247     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6248     uint64_t BitMask = 0xff;
6249     uint64_t Val = 0;
6250     unsigned ImmMask = 1;
6251     Imm = 0;
6252     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6253       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6254         Val |= BitMask;
6255         Imm |= ImmMask;
6256       } else if ((SplatBits & BitMask) != 0) {
6257         return SDValue();
6258       }
6259       BitMask <<= 8;
6260       ImmMask <<= 1;
6261     }
6262
6263     if (DAG.getDataLayout().isBigEndian())
6264       // swap higher and lower 32 bit word
6265       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
6266
6267     // Op=1, Cmode=1110.
6268     OpCmode = 0x1e;
6269     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6270     break;
6271   }
6272
6273   default:
6274     llvm_unreachable("unexpected size for isVMOVModifiedImm");
6275   }
6276
6277   unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6278   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6279 }
6280
6281 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6282                                            const ARMSubtarget *ST) const {
6283   EVT VT = Op.getValueType();
6284   bool IsDouble = (VT == MVT::f64);
6285   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6286   const APFloat &FPVal = CFP->getValueAPF();
6287
6288   // Prevent floating-point constants from using literal loads
6289   // when execute-only is enabled.
6290   if (ST->genExecuteOnly()) {
6291     // If we can represent the constant as an immediate, don't lower it
6292     if (isFPImmLegal(FPVal, VT))
6293       return Op;
6294     // Otherwise, construct as integer, and move to float register
6295     APInt INTVal = FPVal.bitcastToAPInt();
6296     SDLoc DL(CFP);
6297     switch (VT.getSimpleVT().SimpleTy) {
6298       default:
6299         llvm_unreachable("Unknown floating point type!");
6300         break;
6301       case MVT::f64: {
6302         SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6303         SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6304         if (!ST->isLittle())
6305           std::swap(Lo, Hi);
6306         return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6307       }
6308       case MVT::f32:
6309           return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6310               DAG.getConstant(INTVal, DL, MVT::i32));
6311     }
6312   }
6313
6314   if (!ST->hasVFP3Base())
6315     return SDValue();
6316
6317   // Use the default (constant pool) lowering for double constants when we have
6318   // an SP-only FPU
6319   if (IsDouble && !Subtarget->hasFP64())
6320     return SDValue();
6321
6322   // Try splatting with a VMOV.f32...
6323   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6324
6325   if (ImmVal != -1) {
6326     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6327       // We have code in place to select a valid ConstantFP already, no need to
6328       // do any mangling.
6329       return Op;
6330     }
6331
6332     // It's a float and we are trying to use NEON operations where
6333     // possible. Lower it to a splat followed by an extract.
6334     SDLoc DL(Op);
6335     SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6336     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6337                                       NewVal);
6338     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6339                        DAG.getConstant(0, DL, MVT::i32));
6340   }
6341
6342   // The rest of our options are NEON only, make sure that's allowed before
6343   // proceeding..
6344   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6345     return SDValue();
6346
6347   EVT VMovVT;
6348   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6349
6350   // It wouldn't really be worth bothering for doubles except for one very
6351   // important value, which does happen to match: 0.0. So make sure we don't do
6352   // anything stupid.
6353   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6354     return SDValue();
6355
6356   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6357   SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6358                                      VMovVT, false, VMOVModImm);
6359   if (NewVal != SDValue()) {
6360     SDLoc DL(Op);
6361     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6362                                       NewVal);
6363     if (IsDouble)
6364       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6365
6366     // It's a float: cast and extract a vector element.
6367     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6368                                        VecConstant);
6369     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6370                        DAG.getConstant(0, DL, MVT::i32));
6371   }
6372
6373   // Finally, try a VMVN.i32
6374   NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6375                              false, VMVNModImm);
6376   if (NewVal != SDValue()) {
6377     SDLoc DL(Op);
6378     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6379
6380     if (IsDouble)
6381       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6382
6383     // It's a float: cast and extract a vector element.
6384     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6385                                        VecConstant);
6386     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6387                        DAG.getConstant(0, DL, MVT::i32));
6388   }
6389
6390   return SDValue();
6391 }
6392
6393 // check if an VEXT instruction can handle the shuffle mask when the
6394 // vector sources of the shuffle are the same.
6395 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6396   unsigned NumElts = VT.getVectorNumElements();
6397
6398   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6399   if (M[0] < 0)
6400     return false;
6401
6402   Imm = M[0];
6403
6404   // If this is a VEXT shuffle, the immediate value is the index of the first
6405   // element.  The other shuffle indices must be the successive elements after
6406   // the first one.
6407   unsigned ExpectedElt = Imm;
6408   for (unsigned i = 1; i < NumElts; ++i) {
6409     // Increment the expected index.  If it wraps around, just follow it
6410     // back to index zero and keep going.
6411     ++ExpectedElt;
6412     if (ExpectedElt == NumElts)
6413       ExpectedElt = 0;
6414
6415     if (M[i] < 0) continue; // ignore UNDEF indices
6416     if (ExpectedElt != static_cast<unsigned>(M[i]))
6417       return false;
6418   }
6419
6420   return true;
6421 }
6422
6423 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6424                        bool &ReverseVEXT, unsigned &Imm) {
6425   unsigned NumElts = VT.getVectorNumElements();
6426   ReverseVEXT = false;
6427
6428   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6429   if (M[0] < 0)
6430     return false;
6431
6432   Imm = M[0];
6433
6434   // If this is a VEXT shuffle, the immediate value is the index of the first
6435   // element.  The other shuffle indices must be the successive elements after
6436   // the first one.
6437   unsigned ExpectedElt = Imm;
6438   for (unsigned i = 1; i < NumElts; ++i) {
6439     // Increment the expected index.  If it wraps around, it may still be
6440     // a VEXT but the source vectors must be swapped.
6441     ExpectedElt += 1;
6442     if (ExpectedElt == NumElts * 2) {
6443       ExpectedElt = 0;
6444       ReverseVEXT = true;
6445     }
6446
6447     if (M[i] < 0) continue; // ignore UNDEF indices
6448     if (ExpectedElt != static_cast<unsigned>(M[i]))
6449       return false;
6450   }
6451
6452   // Adjust the index value if the source operands will be swapped.
6453   if (ReverseVEXT)
6454     Imm -= NumElts;
6455
6456   return true;
6457 }
6458
6459 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
6460 /// instruction with the specified blocksize.  (The order of the elements
6461 /// within each block of the vector is reversed.)
6462 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
6463   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
6464          "Only possible block sizes for VREV are: 16, 32, 64");
6465
6466   unsigned EltSz = VT.getScalarSizeInBits();
6467   if (EltSz == 64)
6468     return false;
6469
6470   unsigned NumElts = VT.getVectorNumElements();
6471   unsigned BlockElts = M[0] + 1;
6472   // If the first shuffle index is UNDEF, be optimistic.
6473   if (M[0] < 0)
6474     BlockElts = BlockSize / EltSz;
6475
6476   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
6477     return false;
6478
6479   for (unsigned i = 0; i < NumElts; ++i) {
6480     if (M[i] < 0) continue; // ignore UNDEF indices
6481     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
6482       return false;
6483   }
6484
6485   return true;
6486 }
6487
6488 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
6489   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
6490   // range, then 0 is placed into the resulting vector. So pretty much any mask
6491   // of 8 elements can work here.
6492   return VT == MVT::v8i8 && M.size() == 8;
6493 }
6494
6495 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
6496                                unsigned Index) {
6497   if (Mask.size() == Elements * 2)
6498     return Index / Elements;
6499   return Mask[Index] == 0 ? 0 : 1;
6500 }
6501
6502 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
6503 // checking that pairs of elements in the shuffle mask represent the same index
6504 // in each vector, incrementing the expected index by 2 at each step.
6505 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
6506 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
6507 //  v2={e,f,g,h}
6508 // WhichResult gives the offset for each element in the mask based on which
6509 // of the two results it belongs to.
6510 //
6511 // The transpose can be represented either as:
6512 // result1 = shufflevector v1, v2, result1_shuffle_mask
6513 // result2 = shufflevector v1, v2, result2_shuffle_mask
6514 // where v1/v2 and the shuffle masks have the same number of elements
6515 // (here WhichResult (see below) indicates which result is being checked)
6516 //
6517 // or as:
6518 // results = shufflevector v1, v2, shuffle_mask
6519 // where both results are returned in one vector and the shuffle mask has twice
6520 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
6521 // want to check the low half and high half of the shuffle mask as if it were
6522 // the other case
6523 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6524   unsigned EltSz = VT.getScalarSizeInBits();
6525   if (EltSz == 64)
6526     return false;
6527
6528   unsigned NumElts = VT.getVectorNumElements();
6529   if (M.size() != NumElts && M.size() != NumElts*2)
6530     return false;
6531
6532   // If the mask is twice as long as the input vector then we need to check the
6533   // upper and lower parts of the mask with a matching value for WhichResult
6534   // FIXME: A mask with only even values will be rejected in case the first
6535   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
6536   // M[0] is used to determine WhichResult
6537   for (unsigned i = 0; i < M.size(); i += NumElts) {
6538     WhichResult = SelectPairHalf(NumElts, M, i);
6539     for (unsigned j = 0; j < NumElts; j += 2) {
6540       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
6541           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
6542         return false;
6543     }
6544   }
6545
6546   if (M.size() == NumElts*2)
6547     WhichResult = 0;
6548
6549   return true;
6550 }
6551
6552 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
6553 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6554 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
6555 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6556   unsigned EltSz = VT.getScalarSizeInBits();
6557   if (EltSz == 64)
6558     return false;
6559
6560   unsigned NumElts = VT.getVectorNumElements();
6561   if (M.size() != NumElts && M.size() != NumElts*2)
6562     return false;
6563
6564   for (unsigned i = 0; i < M.size(); i += NumElts) {
6565     WhichResult = SelectPairHalf(NumElts, M, i);
6566     for (unsigned j = 0; j < NumElts; j += 2) {
6567       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
6568           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
6569         return false;
6570     }
6571   }
6572
6573   if (M.size() == NumElts*2)
6574     WhichResult = 0;
6575
6576   return true;
6577 }
6578
6579 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
6580 // that the mask elements are either all even and in steps of size 2 or all odd
6581 // and in steps of size 2.
6582 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
6583 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
6584 //  v2={e,f,g,h}
6585 // Requires similar checks to that of isVTRNMask with
6586 // respect the how results are returned.
6587 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6588   unsigned EltSz = VT.getScalarSizeInBits();
6589   if (EltSz == 64)
6590     return false;
6591
6592   unsigned NumElts = VT.getVectorNumElements();
6593   if (M.size() != NumElts && M.size() != NumElts*2)
6594     return false;
6595
6596   for (unsigned i = 0; i < M.size(); i += NumElts) {
6597     WhichResult = SelectPairHalf(NumElts, M, i);
6598     for (unsigned j = 0; j < NumElts; ++j) {
6599       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
6600         return false;
6601     }
6602   }
6603
6604   if (M.size() == NumElts*2)
6605     WhichResult = 0;
6606
6607   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6608   if (VT.is64BitVector() && EltSz == 32)
6609     return false;
6610
6611   return true;
6612 }
6613
6614 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
6615 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6616 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
6617 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6618   unsigned EltSz = VT.getScalarSizeInBits();
6619   if (EltSz == 64)
6620     return false;
6621
6622   unsigned NumElts = VT.getVectorNumElements();
6623   if (M.size() != NumElts && M.size() != NumElts*2)
6624     return false;
6625
6626   unsigned Half = NumElts / 2;
6627   for (unsigned i = 0; i < M.size(); i += NumElts) {
6628     WhichResult = SelectPairHalf(NumElts, M, i);
6629     for (unsigned j = 0; j < NumElts; j += Half) {
6630       unsigned Idx = WhichResult;
6631       for (unsigned k = 0; k < Half; ++k) {
6632         int MIdx = M[i + j + k];
6633         if (MIdx >= 0 && (unsigned) MIdx != Idx)
6634           return false;
6635         Idx += 2;
6636       }
6637     }
6638   }
6639
6640   if (M.size() == NumElts*2)
6641     WhichResult = 0;
6642
6643   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6644   if (VT.is64BitVector() && EltSz == 32)
6645     return false;
6646
6647   return true;
6648 }
6649
6650 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
6651 // that pairs of elements of the shufflemask represent the same index in each
6652 // vector incrementing sequentially through the vectors.
6653 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
6654 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
6655 //  v2={e,f,g,h}
6656 // Requires similar checks to that of isVTRNMask with respect the how results
6657 // are returned.
6658 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6659   unsigned EltSz = VT.getScalarSizeInBits();
6660   if (EltSz == 64)
6661     return false;
6662
6663   unsigned NumElts = VT.getVectorNumElements();
6664   if (M.size() != NumElts && M.size() != NumElts*2)
6665     return false;
6666
6667   for (unsigned i = 0; i < M.size(); i += NumElts) {
6668     WhichResult = SelectPairHalf(NumElts, M, i);
6669     unsigned Idx = WhichResult * NumElts / 2;
6670     for (unsigned j = 0; j < NumElts; j += 2) {
6671       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
6672           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
6673         return false;
6674       Idx += 1;
6675     }
6676   }
6677
6678   if (M.size() == NumElts*2)
6679     WhichResult = 0;
6680
6681   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6682   if (VT.is64BitVector() && EltSz == 32)
6683     return false;
6684
6685   return true;
6686 }
6687
6688 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
6689 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6690 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
6691 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6692   unsigned EltSz = VT.getScalarSizeInBits();
6693   if (EltSz == 64)
6694     return false;
6695
6696   unsigned NumElts = VT.getVectorNumElements();
6697   if (M.size() != NumElts && M.size() != NumElts*2)
6698     return false;
6699
6700   for (unsigned i = 0; i < M.size(); i += NumElts) {
6701     WhichResult = SelectPairHalf(NumElts, M, i);
6702     unsigned Idx = WhichResult * NumElts / 2;
6703     for (unsigned j = 0; j < NumElts; j += 2) {
6704       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
6705           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
6706         return false;
6707       Idx += 1;
6708     }
6709   }
6710
6711   if (M.size() == NumElts*2)
6712     WhichResult = 0;
6713
6714   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6715   if (VT.is64BitVector() && EltSz == 32)
6716     return false;
6717
6718   return true;
6719 }
6720
6721 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
6722 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
6723 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
6724                                            unsigned &WhichResult,
6725                                            bool &isV_UNDEF) {
6726   isV_UNDEF = false;
6727   if (isVTRNMask(ShuffleMask, VT, WhichResult))
6728     return ARMISD::VTRN;
6729   if (isVUZPMask(ShuffleMask, VT, WhichResult))
6730     return ARMISD::VUZP;
6731   if (isVZIPMask(ShuffleMask, VT, WhichResult))
6732     return ARMISD::VZIP;
6733
6734   isV_UNDEF = true;
6735   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
6736     return ARMISD::VTRN;
6737   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
6738     return ARMISD::VUZP;
6739   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
6740     return ARMISD::VZIP;
6741
6742   return 0;
6743 }
6744
6745 /// \return true if this is a reverse operation on an vector.
6746 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
6747   unsigned NumElts = VT.getVectorNumElements();
6748   // Make sure the mask has the right size.
6749   if (NumElts != M.size())
6750       return false;
6751
6752   // Look for <15, ..., 3, -1, 1, 0>.
6753   for (unsigned i = 0; i != NumElts; ++i)
6754     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
6755       return false;
6756
6757   return true;
6758 }
6759
6760 // If N is an integer constant that can be moved into a register in one
6761 // instruction, return an SDValue of such a constant (will become a MOV
6762 // instruction).  Otherwise return null.
6763 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
6764                                      const ARMSubtarget *ST, const SDLoc &dl) {
6765   uint64_t Val;
6766   if (!isa<ConstantSDNode>(N))
6767     return SDValue();
6768   Val = cast<ConstantSDNode>(N)->getZExtValue();
6769
6770   if (ST->isThumb1Only()) {
6771     if (Val <= 255 || ~Val <= 255)
6772       return DAG.getConstant(Val, dl, MVT::i32);
6773   } else {
6774     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
6775       return DAG.getConstant(Val, dl, MVT::i32);
6776   }
6777   return SDValue();
6778 }
6779
6780 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
6781                                     const ARMSubtarget *ST) {
6782   SDLoc dl(Op);
6783   EVT VT = Op.getValueType();
6784
6785   assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
6786
6787   unsigned NumElts = VT.getVectorNumElements();
6788   unsigned BoolMask;
6789   unsigned BitsPerBool;
6790   if (NumElts == 4) {
6791     BitsPerBool = 4;
6792     BoolMask = 0xf;
6793   } else if (NumElts == 8) {
6794     BitsPerBool = 2;
6795     BoolMask = 0x3;
6796   } else if (NumElts == 16) {
6797     BitsPerBool = 1;
6798     BoolMask = 0x1;
6799   } else
6800     return SDValue();
6801
6802   // First create base with bits set where known
6803   unsigned Bits32 = 0;
6804   for (unsigned i = 0; i < NumElts; ++i) {
6805     SDValue V = Op.getOperand(i);
6806     if (!isa<ConstantSDNode>(V) && !V.isUndef())
6807       continue;
6808     bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
6809     if (BitSet)
6810       Bits32 |= BoolMask << (i * BitsPerBool);
6811   }
6812
6813   // Add in unknown nodes
6814   // FIXME: Handle splats of the same value better.
6815   SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
6816                              DAG.getConstant(Bits32, dl, MVT::i32));
6817   for (unsigned i = 0; i < NumElts; ++i) {
6818     SDValue V = Op.getOperand(i);
6819     if (isa<ConstantSDNode>(V) || V.isUndef())
6820       continue;
6821     Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
6822                        DAG.getConstant(i, dl, MVT::i32));
6823   }
6824
6825   return Base;
6826 }
6827
6828 // If this is a case we can't handle, return null and let the default
6829 // expansion code take care of it.
6830 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
6831                                              const ARMSubtarget *ST) const {
6832   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
6833   SDLoc dl(Op);
6834   EVT VT = Op.getValueType();
6835
6836   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
6837     return LowerBUILD_VECTOR_i1(Op, DAG, ST);
6838
6839   APInt SplatBits, SplatUndef;
6840   unsigned SplatBitSize;
6841   bool HasAnyUndefs;
6842   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
6843     if (SplatUndef.isAllOnesValue())
6844       return DAG.getUNDEF(VT);
6845
6846     if ((ST->hasNEON() && SplatBitSize <= 64) ||
6847         (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
6848       // Check if an immediate VMOV works.
6849       EVT VmovVT;
6850       SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
6851                                       SplatUndef.getZExtValue(), SplatBitSize,
6852                                       DAG, dl, VmovVT, VT.is128BitVector(),
6853                                       VMOVModImm);
6854
6855       if (Val.getNode()) {
6856         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
6857         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6858       }
6859
6860       // Try an immediate VMVN.
6861       uint64_t NegatedImm = (~SplatBits).getZExtValue();
6862       Val = isVMOVModifiedImm(
6863           NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
6864           DAG, dl, VmovVT, VT.is128BitVector(),
6865           ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
6866       if (Val.getNode()) {
6867         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
6868         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6869       }
6870
6871       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
6872       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
6873         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
6874         if (ImmVal != -1) {
6875           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
6876           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
6877         }
6878       }
6879     }
6880   }
6881
6882   // Scan through the operands to see if only one value is used.
6883   //
6884   // As an optimisation, even if more than one value is used it may be more
6885   // profitable to splat with one value then change some lanes.
6886   //
6887   // Heuristically we decide to do this if the vector has a "dominant" value,
6888   // defined as splatted to more than half of the lanes.
6889   unsigned NumElts = VT.getVectorNumElements();
6890   bool isOnlyLowElement = true;
6891   bool usesOnlyOneValue = true;
6892   bool hasDominantValue = false;
6893   bool isConstant = true;
6894
6895   // Map of the number of times a particular SDValue appears in the
6896   // element list.
6897   DenseMap<SDValue, unsigned> ValueCounts;
6898   SDValue Value;
6899   for (unsigned i = 0; i < NumElts; ++i) {
6900     SDValue V = Op.getOperand(i);
6901     if (V.isUndef())
6902       continue;
6903     if (i > 0)
6904       isOnlyLowElement = false;
6905     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
6906       isConstant = false;
6907
6908     ValueCounts.insert(std::make_pair(V, 0));
6909     unsigned &Count = ValueCounts[V];
6910
6911     // Is this value dominant? (takes up more than half of the lanes)
6912     if (++Count > (NumElts / 2)) {
6913       hasDominantValue = true;
6914       Value = V;
6915     }
6916   }
6917   if (ValueCounts.size() != 1)
6918     usesOnlyOneValue = false;
6919   if (!Value.getNode() && !ValueCounts.empty())
6920     Value = ValueCounts.begin()->first;
6921
6922   if (ValueCounts.empty())
6923     return DAG.getUNDEF(VT);
6924
6925   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
6926   // Keep going if we are hitting this case.
6927   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
6928     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
6929
6930   unsigned EltSize = VT.getScalarSizeInBits();
6931
6932   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
6933   // i32 and try again.
6934   if (hasDominantValue && EltSize <= 32) {
6935     if (!isConstant) {
6936       SDValue N;
6937
6938       // If we are VDUPing a value that comes directly from a vector, that will
6939       // cause an unnecessary move to and from a GPR, where instead we could
6940       // just use VDUPLANE. We can only do this if the lane being extracted
6941       // is at a constant index, as the VDUP from lane instructions only have
6942       // constant-index forms.
6943       ConstantSDNode *constIndex;
6944       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6945           (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
6946         // We need to create a new undef vector to use for the VDUPLANE if the
6947         // size of the vector from which we get the value is different than the
6948         // size of the vector that we need to create. We will insert the element
6949         // such that the register coalescer will remove unnecessary copies.
6950         if (VT != Value->getOperand(0).getValueType()) {
6951           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
6952                              VT.getVectorNumElements();
6953           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6954                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
6955                         Value, DAG.getConstant(index, dl, MVT::i32)),
6956                            DAG.getConstant(index, dl, MVT::i32));
6957         } else
6958           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
6959                         Value->getOperand(0), Value->getOperand(1));
6960       } else
6961         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
6962
6963       if (!usesOnlyOneValue) {
6964         // The dominant value was splatted as 'N', but we now have to insert
6965         // all differing elements.
6966         for (unsigned I = 0; I < NumElts; ++I) {
6967           if (Op.getOperand(I) == Value)
6968             continue;
6969           SmallVector<SDValue, 3> Ops;
6970           Ops.push_back(N);
6971           Ops.push_back(Op.getOperand(I));
6972           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
6973           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
6974         }
6975       }
6976       return N;
6977     }
6978     if (VT.getVectorElementType().isFloatingPoint()) {
6979       SmallVector<SDValue, 8> Ops;
6980       MVT FVT = VT.getVectorElementType().getSimpleVT();
6981       assert(FVT == MVT::f32 || FVT == MVT::f16);
6982       MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
6983       for (unsigned i = 0; i < NumElts; ++i)
6984         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
6985                                   Op.getOperand(i)));
6986       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
6987       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
6988       Val = LowerBUILD_VECTOR(Val, DAG, ST);
6989       if (Val.getNode())
6990         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
6991     }
6992     if (usesOnlyOneValue) {
6993       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
6994       if (isConstant && Val.getNode())
6995         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
6996     }
6997   }
6998
6999   // If all elements are constants and the case above didn't get hit, fall back
7000   // to the default expansion, which will generate a load from the constant
7001   // pool.
7002   if (isConstant)
7003     return SDValue();
7004
7005   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
7006   if (NumElts >= 4) {
7007     SDValue shuffle = ReconstructShuffle(Op, DAG);
7008     if (shuffle != SDValue())
7009       return shuffle;
7010   }
7011
7012   if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7013     // If we haven't found an efficient lowering, try splitting a 128-bit vector
7014     // into two 64-bit vectors; we might discover a better way to lower it.
7015     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7016     EVT ExtVT = VT.getVectorElementType();
7017     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7018     SDValue Lower =
7019         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
7020     if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7021       Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7022     SDValue Upper = DAG.getBuildVector(
7023         HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
7024     if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7025       Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7026     if (Lower && Upper)
7027       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7028   }
7029
7030   // Vectors with 32- or 64-bit elements can be built by directly assigning
7031   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
7032   // will be legalized.
7033   if (EltSize >= 32) {
7034     // Do the expansion with floating-point types, since that is what the VFP
7035     // registers are defined to use, and since i64 is not legal.
7036     EVT EltVT = EVT::getFloatingPointVT(EltSize);
7037     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7038     SmallVector<SDValue, 8> Ops;
7039     for (unsigned i = 0; i < NumElts; ++i)
7040       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7041     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7042     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7043   }
7044
7045   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7046   // know the default expansion would otherwise fall back on something even
7047   // worse. For a vector with one or two non-undef values, that's
7048   // scalar_to_vector for the elements followed by a shuffle (provided the
7049   // shuffle is valid for the target) and materialization element by element
7050   // on the stack followed by a load for everything else.
7051   if (!isConstant && !usesOnlyOneValue) {
7052     SDValue Vec = DAG.getUNDEF(VT);
7053     for (unsigned i = 0 ; i < NumElts; ++i) {
7054       SDValue V = Op.getOperand(i);
7055       if (V.isUndef())
7056         continue;
7057       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7058       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7059     }
7060     return Vec;
7061   }
7062
7063   return SDValue();
7064 }
7065
7066 // Gather data to see if the operation can be modelled as a
7067 // shuffle in combination with VEXTs.
7068 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7069                                               SelectionDAG &DAG) const {
7070   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7071   SDLoc dl(Op);
7072   EVT VT = Op.getValueType();
7073   unsigned NumElts = VT.getVectorNumElements();
7074
7075   struct ShuffleSourceInfo {
7076     SDValue Vec;
7077     unsigned MinElt = std::numeric_limits<unsigned>::max();
7078     unsigned MaxElt = 0;
7079
7080     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7081     // be compatible with the shuffle we intend to construct. As a result
7082     // ShuffleVec will be some sliding window into the original Vec.
7083     SDValue ShuffleVec;
7084
7085     // Code should guarantee that element i in Vec starts at element "WindowBase
7086     // + i * WindowScale in ShuffleVec".
7087     int WindowBase = 0;
7088     int WindowScale = 1;
7089
7090     ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7091
7092     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7093   };
7094
7095   // First gather all vectors used as an immediate source for this BUILD_VECTOR
7096   // node.
7097   SmallVector<ShuffleSourceInfo, 2> Sources;
7098   for (unsigned i = 0; i < NumElts; ++i) {
7099     SDValue V = Op.getOperand(i);
7100     if (V.isUndef())
7101       continue;
7102     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7103       // A shuffle can only come from building a vector from various
7104       // elements of other vectors.
7105       return SDValue();
7106     } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7107       // Furthermore, shuffles require a constant mask, whereas extractelts
7108       // accept variable indices.
7109       return SDValue();
7110     }
7111
7112     // Add this element source to the list if it's not already there.
7113     SDValue SourceVec = V.getOperand(0);
7114     auto Source = llvm::find(Sources, SourceVec);
7115     if (Source == Sources.end())
7116       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7117
7118     // Update the minimum and maximum lane number seen.
7119     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
7120     Source->MinElt = std::min(Source->MinElt, EltNo);
7121     Source->MaxElt = std::max(Source->MaxElt, EltNo);
7122   }
7123
7124   // Currently only do something sane when at most two source vectors
7125   // are involved.
7126   if (Sources.size() > 2)
7127     return SDValue();
7128
7129   // Find out the smallest element size among result and two sources, and use
7130   // it as element size to build the shuffle_vector.
7131   EVT SmallestEltTy = VT.getVectorElementType();
7132   for (auto &Source : Sources) {
7133     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7134     if (SrcEltTy.bitsLT(SmallestEltTy))
7135       SmallestEltTy = SrcEltTy;
7136   }
7137   unsigned ResMultiplier =
7138       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7139   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7140   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7141
7142   // If the source vector is too wide or too narrow, we may nevertheless be able
7143   // to construct a compatible shuffle either by concatenating it with UNDEF or
7144   // extracting a suitable range of elements.
7145   for (auto &Src : Sources) {
7146     EVT SrcVT = Src.ShuffleVec.getValueType();
7147
7148     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
7149       continue;
7150
7151     // This stage of the search produces a source with the same element type as
7152     // the original, but with a total width matching the BUILD_VECTOR output.
7153     EVT EltVT = SrcVT.getVectorElementType();
7154     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
7155     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
7156
7157     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
7158       if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
7159         return SDValue();
7160       // We can pad out the smaller vector for free, so if it's part of a
7161       // shuffle...
7162       Src.ShuffleVec =
7163           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
7164                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
7165       continue;
7166     }
7167
7168     if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
7169       return SDValue();
7170
7171     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
7172       // Span too large for a VEXT to cope
7173       return SDValue();
7174     }
7175
7176     if (Src.MinElt >= NumSrcElts) {
7177       // The extraction can just take the second half
7178       Src.ShuffleVec =
7179           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7180                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
7181       Src.WindowBase = -NumSrcElts;
7182     } else if (Src.MaxElt < NumSrcElts) {
7183       // The extraction can just take the first half
7184       Src.ShuffleVec =
7185           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7186                       DAG.getConstant(0, dl, MVT::i32));
7187     } else {
7188       // An actual VEXT is needed
7189       SDValue VEXTSrc1 =
7190           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7191                       DAG.getConstant(0, dl, MVT::i32));
7192       SDValue VEXTSrc2 =
7193           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7194                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
7195
7196       Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
7197                                    VEXTSrc2,
7198                                    DAG.getConstant(Src.MinElt, dl, MVT::i32));
7199       Src.WindowBase = -Src.MinElt;
7200     }
7201   }
7202
7203   // Another possible incompatibility occurs from the vector element types. We
7204   // can fix this by bitcasting the source vectors to the same type we intend
7205   // for the shuffle.
7206   for (auto &Src : Sources) {
7207     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
7208     if (SrcEltTy == SmallestEltTy)
7209       continue;
7210     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
7211     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
7212     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
7213     Src.WindowBase *= Src.WindowScale;
7214   }
7215
7216   // Final sanity check before we try to actually produce a shuffle.
7217   LLVM_DEBUG(for (auto Src
7218                   : Sources)
7219                  assert(Src.ShuffleVec.getValueType() == ShuffleVT););
7220
7221   // The stars all align, our next step is to produce the mask for the shuffle.
7222   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
7223   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
7224   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
7225     SDValue Entry = Op.getOperand(i);
7226     if (Entry.isUndef())
7227       continue;
7228
7229     auto Src = llvm::find(Sources, Entry.getOperand(0));
7230     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
7231
7232     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
7233     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
7234     // segment.
7235     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
7236     int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
7237                                VT.getScalarSizeInBits());
7238     int LanesDefined = BitsDefined / BitsPerShuffleLane;
7239
7240     // This source is expected to fill ResMultiplier lanes of the final shuffle,
7241     // starting at the appropriate offset.
7242     int *LaneMask = &Mask[i * ResMultiplier];
7243
7244     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
7245     ExtractBase += NumElts * (Src - Sources.begin());
7246     for (int j = 0; j < LanesDefined; ++j)
7247       LaneMask[j] = ExtractBase + j;
7248   }
7249
7250   // Final check before we try to produce nonsense...
7251   if (!isShuffleMaskLegal(Mask, ShuffleVT))
7252     return SDValue();
7253
7254   // We can't handle more than two sources. This should have already
7255   // been checked before this point.
7256   assert(Sources.size() <= 2 && "Too many sources!");
7257
7258   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
7259   for (unsigned i = 0; i < Sources.size(); ++i)
7260     ShuffleOps[i] = Sources[i].ShuffleVec;
7261
7262   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
7263                                          ShuffleOps[1], Mask);
7264   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
7265 }
7266
// Operations a perfect-shuffle table entry can request. The numeric values
// are what the users in this file decode from the entry's 4-bit opcode field
// ((PFEntry >> 26) & 0x0F), so they are spelled out explicitly and must not be
// renumbered.
enum ShuffleOpCodes {
  OP_COPY = 0,   // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV = 1,
  OP_VDUP0 = 2,
  OP_VDUP1 = 3,
  OP_VDUP2 = 4,
  OP_VDUP3 = 5,
  OP_VEXT1 = 6,
  OP_VEXT2 = 7,
  OP_VEXT3 = 8,
  OP_VUZPL = 9,  // VUZP, left result
  OP_VUZPR = 10, // VUZP, right result
  OP_VZIPL = 11, // VZIP, left result
  OP_VZIPR = 12, // VZIP, right result
  OP_VTRNL = 13, // VTRN, left result
  OP_VTRNR = 14  // VTRN, right result
};
7284
7285 static bool isLegalMVEShuffleOp(unsigned PFEntry) {
7286   unsigned OpNum = (PFEntry >> 26) & 0x0F;
7287   switch (OpNum) {
7288   case OP_COPY:
7289   case OP_VREV:
7290   case OP_VDUP0:
7291   case OP_VDUP1:
7292   case OP_VDUP2:
7293   case OP_VDUP3:
7294     return true;
7295   }
7296   return false;
7297 }
7298
7299 /// isShuffleMaskLegal - Targets can use this to indicate that they only
7300 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7301 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7302 /// are assumed to be legal.
7303 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
7304   if (VT.getVectorNumElements() == 4 &&
7305       (VT.is128BitVector() || VT.is64BitVector())) {
7306     unsigned PFIndexes[4];
7307     for (unsigned i = 0; i != 4; ++i) {
7308       if (M[i] < 0)
7309         PFIndexes[i] = 8;
7310       else
7311         PFIndexes[i] = M[i];
7312     }
7313
7314     // Compute the index in the perfect shuffle table.
7315     unsigned PFTableIndex =
7316       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
7317     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7318     unsigned Cost = (PFEntry >> 30);
7319
7320     if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
7321       return true;
7322   }
7323
7324   bool ReverseVEXT, isV_UNDEF;
7325   unsigned Imm, WhichResult;
7326
7327   unsigned EltSize = VT.getScalarSizeInBits();
7328   if (EltSize >= 32 ||
7329       ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7330       isVREVMask(M, VT, 64) ||
7331       isVREVMask(M, VT, 32) ||
7332       isVREVMask(M, VT, 16))
7333     return true;
7334   else if (Subtarget->hasNEON() &&
7335            (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
7336             isVTBLMask(M, VT) ||
7337             isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
7338     return true;
7339   else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
7340            isReverseMask(M, VT))
7341     return true;
7342   else
7343     return false;
7344 }
7345
7346 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
7347 /// the specified operations to build the shuffle.
7348 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
7349                                       SDValue RHS, SelectionDAG &DAG,
7350                                       const SDLoc &dl) {
7351   unsigned OpNum = (PFEntry >> 26) & 0x0F;
7352   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
7353   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
7354
7355   if (OpNum == OP_COPY) {
7356     if (LHSID == (1*9+2)*9+3) return LHS;
7357     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
7358     return RHS;
7359   }
7360
7361   SDValue OpLHS, OpRHS;
7362   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
7363   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
7364   EVT VT = OpLHS.getValueType();
7365
7366   switch (OpNum) {
7367   default: llvm_unreachable("Unknown shuffle opcode!");
7368   case OP_VREV:
7369     // VREV divides the vector in half and swaps within the half.
7370     if (VT.getVectorElementType() == MVT::i32 ||
7371         VT.getVectorElementType() == MVT::f32)
7372       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
7373     // vrev <4 x i16> -> VREV32
7374     if (VT.getVectorElementType() == MVT::i16)
7375       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
7376     // vrev <4 x i8> -> VREV16
7377     assert(VT.getVectorElementType() == MVT::i8);
7378     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
7379   case OP_VDUP0:
7380   case OP_VDUP1:
7381   case OP_VDUP2:
7382   case OP_VDUP3:
7383     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7384                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
7385   case OP_VEXT1:
7386   case OP_VEXT2:
7387   case OP_VEXT3:
7388     return DAG.getNode(ARMISD::VEXT, dl, VT,
7389                        OpLHS, OpRHS,
7390                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
7391   case OP_VUZPL:
7392   case OP_VUZPR:
7393     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
7394                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
7395   case OP_VZIPL:
7396   case OP_VZIPR:
7397     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
7398                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
7399   case OP_VTRNL:
7400   case OP_VTRNR:
7401     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
7402                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
7403   }
7404 }
7405
7406 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
7407                                        ArrayRef<int> ShuffleMask,
7408                                        SelectionDAG &DAG) {
7409   // Check to see if we can use the VTBL instruction.
7410   SDValue V1 = Op.getOperand(0);
7411   SDValue V2 = Op.getOperand(1);
7412   SDLoc DL(Op);
7413
7414   SmallVector<SDValue, 8> VTBLMask;
7415   for (ArrayRef<int>::iterator
7416          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
7417     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
7418
7419   if (V2.getNode()->isUndef())
7420     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
7421                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
7422
7423   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
7424                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
7425 }
7426
7427 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
7428                                                       SelectionDAG &DAG) {
7429   SDLoc DL(Op);
7430   SDValue OpLHS = Op.getOperand(0);
7431   EVT VT = OpLHS.getValueType();
7432
7433   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
7434          "Expect an v8i16/v16i8 type");
7435   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
7436   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
7437   // extract the first 8 bytes into the top double word and the last 8 bytes
7438   // into the bottom double word. The v8i16 case is similar.
7439   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
7440   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
7441                      DAG.getConstant(ExtractNum, DL, MVT::i32));
7442 }
7443
7444 static EVT getVectorTyFromPredicateVector(EVT VT) {
7445   switch (VT.getSimpleVT().SimpleTy) {
7446   case MVT::v4i1:
7447     return MVT::v4i32;
7448   case MVT::v8i1:
7449     return MVT::v8i16;
7450   case MVT::v16i1:
7451     return MVT::v16i8;
7452   default:
7453     llvm_unreachable("Unexpected vector predicate type");
7454   }
7455 }
7456
7457 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
7458                                     SelectionDAG &DAG) {
7459   // Converting from boolean predicates to integers involves creating a vector
7460   // of all ones or all zeroes and selecting the lanes based upon the real
7461   // predicate.
7462   SDValue AllOnes =
7463       DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
7464   AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
7465
7466   SDValue AllZeroes =
7467       DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
7468   AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
7469
7470   // Get full vector type from predicate type
7471   EVT NewVT = getVectorTyFromPredicateVector(VT);
7472
7473   SDValue RecastV1;
7474   // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
7475   // this to a v16i1. This cannot be done with an ordinary bitcast because the
7476   // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
7477   // since we know in hardware the sizes are really the same.
7478   if (VT != MVT::v16i1)
7479     RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
7480   else
7481     RecastV1 = Pred;
7482
7483   // Select either all ones or zeroes depending upon the real predicate bits.
7484   SDValue PredAsVector =
7485       DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
7486
7487   // Recast our new predicate-as-integer v16i8 vector into something
7488   // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
7489   return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
7490 }
7491
7492 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
7493                                       const ARMSubtarget *ST) {
7494   EVT VT = Op.getValueType();
7495   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
7496   ArrayRef<int> ShuffleMask = SVN->getMask();
7497
7498   assert(ST->hasMVEIntegerOps() &&
7499          "No support for vector shuffle of boolean predicates");
7500
7501   SDValue V1 = Op.getOperand(0);
7502   SDLoc dl(Op);
7503   if (isReverseMask(ShuffleMask, VT)) {
7504     SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
7505     SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
7506     SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
7507                               DAG.getConstant(16, dl, MVT::i32));
7508     return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
7509   }
7510
7511   // Until we can come up with optimised cases for every single vector
7512   // shuffle in existence we have chosen the least painful strategy. This is
7513   // to essentially promote the boolean predicate to a 8-bit integer, where
7514   // each predicate represents a byte. Then we fall back on a normal integer
7515   // vector shuffle and convert the result back into a predicate vector. In
7516   // many cases the generated code might be even better than scalar code
7517   // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
7518   // fields in a register into 8 other arbitrary 2-bit fields!
7519   SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
7520   EVT NewVT = PredAsVector.getValueType();
7521
7522   // Do the shuffle!
7523   SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
7524                                           DAG.getUNDEF(NewVT), ShuffleMask);
7525
7526   // Now return the result of comparing the shuffled vector with zero,
7527   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
7528   return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
7529                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
7530 }
7531
7532 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
7533                                    const ARMSubtarget *ST) {
7534   SDValue V1 = Op.getOperand(0);
7535   SDValue V2 = Op.getOperand(1);
7536   SDLoc dl(Op);
7537   EVT VT = Op.getValueType();
7538   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
7539   unsigned EltSize = VT.getScalarSizeInBits();
7540
7541   if (ST->hasMVEIntegerOps() && EltSize == 1)
7542     return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
7543
7544   // Convert shuffles that are directly supported on NEON to target-specific
7545   // DAG nodes, instead of keeping them as shuffles and matching them again
7546   // during code selection.  This is more efficient and avoids the possibility
7547   // of inconsistencies between legalization and selection.
7548   // FIXME: floating-point vectors should be canonicalized to integer vectors
7549   // of the same time so that they get CSEd properly.
7550   ArrayRef<int> ShuffleMask = SVN->getMask();
7551
7552   if (EltSize <= 32) {
7553     if (SVN->isSplat()) {
7554       int Lane = SVN->getSplatIndex();
7555       // If this is undef splat, generate it via "just" vdup, if possible.
7556       if (Lane == -1) Lane = 0;
7557
7558       // Test if V1 is a SCALAR_TO_VECTOR.
7559       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7560         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
7561       }
7562       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
7563       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
7564       // reaches it).
7565       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
7566           !isa<ConstantSDNode>(V1.getOperand(0))) {
7567         bool IsScalarToVector = true;
7568         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
7569           if (!V1.getOperand(i).isUndef()) {
7570             IsScalarToVector = false;
7571             break;
7572           }
7573         if (IsScalarToVector)
7574           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
7575       }
7576       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
7577                          DAG.getConstant(Lane, dl, MVT::i32));
7578     }
7579
7580     bool ReverseVEXT = false;
7581     unsigned Imm = 0;
7582     if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
7583       if (ReverseVEXT)
7584         std::swap(V1, V2);
7585       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
7586                          DAG.getConstant(Imm, dl, MVT::i32));
7587     }
7588
7589     if (isVREVMask(ShuffleMask, VT, 64))
7590       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
7591     if (isVREVMask(ShuffleMask, VT, 32))
7592       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
7593     if (isVREVMask(ShuffleMask, VT, 16))
7594       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
7595
7596     if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
7597       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
7598                          DAG.getConstant(Imm, dl, MVT::i32));
7599     }
7600
7601     // Check for Neon shuffles that modify both input vectors in place.
7602     // If both results are used, i.e., if there are two shuffles with the same
7603     // source operands and with masks corresponding to both results of one of
7604     // these operations, DAG memoization will ensure that a single node is
7605     // used for both shuffles.
7606     unsigned WhichResult = 0;
7607     bool isV_UNDEF = false;
7608     if (ST->hasNEON()) {
7609       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
7610               ShuffleMask, VT, WhichResult, isV_UNDEF)) {
7611         if (isV_UNDEF)
7612           V2 = V1;
7613         return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
7614             .getValue(WhichResult);
7615       }
7616     }
7617
7618     // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
7619     // shuffles that produce a result larger than their operands with:
7620     //   shuffle(concat(v1, undef), concat(v2, undef))
7621     // ->
7622     //   shuffle(concat(v1, v2), undef)
7623     // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
7624     //
7625     // This is useful in the general case, but there are special cases where
7626     // native shuffles produce larger results: the two-result ops.
7627     //
7628     // Look through the concat when lowering them:
7629     //   shuffle(concat(v1, v2), undef)
7630     // ->
7631     //   concat(VZIP(v1, v2):0, :1)
7632     //
7633     if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
7634       SDValue SubV1 = V1->getOperand(0);
7635       SDValue SubV2 = V1->getOperand(1);
7636       EVT SubVT = SubV1.getValueType();
7637
7638       // We expect these to have been canonicalized to -1.
7639       assert(llvm::all_of(ShuffleMask, [&](int i) {
7640         return i < (int)VT.getVectorNumElements();
7641       }) && "Unexpected shuffle index into UNDEF operand!");
7642
7643       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
7644               ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
7645         if (isV_UNDEF)
7646           SubV2 = SubV1;
7647         assert((WhichResult == 0) &&
7648                "In-place shuffle of concat can only have one result!");
7649         SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
7650                                   SubV1, SubV2);
7651         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
7652                            Res.getValue(1));
7653       }
7654     }
7655   }
7656
7657   // If the shuffle is not directly supported and it has 4 elements, use
7658   // the PerfectShuffle-generated table to synthesize it from other shuffles.
7659   unsigned NumElts = VT.getVectorNumElements();
7660   if (NumElts == 4) {
7661     unsigned PFIndexes[4];
7662     for (unsigned i = 0; i != 4; ++i) {
7663       if (ShuffleMask[i] < 0)
7664         PFIndexes[i] = 8;
7665       else
7666         PFIndexes[i] = ShuffleMask[i];
7667     }
7668
7669     // Compute the index in the perfect shuffle table.
7670     unsigned PFTableIndex =
7671       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
7672     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7673     unsigned Cost = (PFEntry >> 30);
7674
7675     if (Cost <= 4) {
7676       if (ST->hasNEON())
7677         return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
7678       else if (isLegalMVEShuffleOp(PFEntry)) {
7679         unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
7680         unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
7681         unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
7682         unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
7683         if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
7684           return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
7685       }
7686     }
7687   }
7688
7689   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
7690   if (EltSize >= 32) {
7691     // Do the expansion with floating-point types, since that is what the VFP
7692     // registers are defined to use, and since i64 is not legal.
7693     EVT EltVT = EVT::getFloatingPointVT(EltSize);
7694     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7695     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
7696     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
7697     SmallVector<SDValue, 8> Ops;
7698     for (unsigned i = 0; i < NumElts; ++i) {
7699       if (ShuffleMask[i] < 0)
7700         Ops.push_back(DAG.getUNDEF(EltVT));
7701       else
7702         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
7703                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
7704                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
7705                                                   dl, MVT::i32)));
7706     }
7707     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7708     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7709   }
7710
7711   if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
7712     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
7713
7714   if (ST->hasNEON() && VT == MVT::v8i8)
7715     if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
7716       return NewOp;
7717
7718   return SDValue();
7719 }
7720
7721 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
7722                                          const ARMSubtarget *ST) {
7723   EVT VecVT = Op.getOperand(0).getValueType();
7724   SDLoc dl(Op);
7725
7726   assert(ST->hasMVEIntegerOps() &&
7727          "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
7728
7729   SDValue Conv =
7730       DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
7731   unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
7732   unsigned LaneWidth =
7733       getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
7734   unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
7735   SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
7736                             Op.getOperand(1), DAG.getValueType(MVT::i1));
7737   SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
7738                             DAG.getConstant(~Mask, dl, MVT::i32));
7739   return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
7740 }
7741
7742 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
7743                                                   SelectionDAG &DAG) const {
7744   // INSERT_VECTOR_ELT is legal only for immediate indexes.
7745   SDValue Lane = Op.getOperand(2);
7746   if (!isa<ConstantSDNode>(Lane))
7747     return SDValue();
7748
7749   SDValue Elt = Op.getOperand(1);
7750   EVT EltVT = Elt.getValueType();
7751
7752   if (Subtarget->hasMVEIntegerOps() &&
7753       Op.getValueType().getScalarSizeInBits() == 1)
7754     return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
7755
7756   if (getTypeAction(*DAG.getContext(), EltVT) ==
7757       TargetLowering::TypePromoteFloat) {
7758     // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
7759     // but the type system will try to do that if we don't intervene.
7760     // Reinterpret any such vector-element insertion as one with the
7761     // corresponding integer types.
7762
7763     SDLoc dl(Op);
7764
7765     EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
7766     assert(getTypeAction(*DAG.getContext(), IEltVT) !=
7767            TargetLowering::TypePromoteFloat);
7768
7769     SDValue VecIn = Op.getOperand(0);
7770     EVT VecVT = VecIn.getValueType();
7771     EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
7772                                   VecVT.getVectorNumElements());
7773
7774     SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
7775     SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
7776     SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
7777                                   IVecIn, IElt, Lane);
7778     return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
7779   }
7780
7781   return Op;
7782 }
7783
7784 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
7785                                           const ARMSubtarget *ST) {
7786   EVT VecVT = Op.getOperand(0).getValueType();
7787   SDLoc dl(Op);
7788
7789   assert(ST->hasMVEIntegerOps() &&
7790          "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
7791
7792   SDValue Conv =
7793       DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
7794   unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7795   unsigned LaneWidth =
7796       getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
7797   SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
7798                               DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
7799   return Shift;
7800 }
7801
7802 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
7803                                        const ARMSubtarget *ST) {
7804   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
7805   SDValue Lane = Op.getOperand(1);
7806   if (!isa<ConstantSDNode>(Lane))
7807     return SDValue();
7808
7809   SDValue Vec = Op.getOperand(0);
7810   EVT VT = Vec.getValueType();
7811
7812   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7813     return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
7814
7815   if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
7816     SDLoc dl(Op);
7817     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
7818   }
7819
7820   return Op;
7821 }
7822
7823 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
7824                                       const ARMSubtarget *ST) {
7825   SDValue V1 = Op.getOperand(0);
7826   SDValue V2 = Op.getOperand(1);
7827   SDLoc dl(Op);
7828   EVT VT = Op.getValueType();
7829   EVT Op1VT = V1.getValueType();
7830   EVT Op2VT = V2.getValueType();
7831   unsigned NumElts = VT.getVectorNumElements();
7832
7833   assert(Op1VT == Op2VT && "Operand types don't match!");
7834   assert(VT.getScalarSizeInBits() == 1 &&
7835          "Unexpected custom CONCAT_VECTORS lowering");
7836   assert(ST->hasMVEIntegerOps() &&
7837          "CONCAT_VECTORS lowering only supported for MVE");
7838
7839   SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
7840   SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
7841
7842   // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
7843   // promoted to v8i16, etc.
7844
7845   MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
7846
7847   // Extract the vector elements from Op1 and Op2 one by one and truncate them
7848   // to be the right size for the destination. For example, if Op1 is v4i1 then
7849   // the promoted vector is v4i32. The result of concatentation gives a v8i1,
7850   // which when promoted is v8i16. That means each i32 element from Op1 needs
7851   // truncating to i16 and inserting in the result.
7852   EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
7853   SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
7854   auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
7855     EVT NewVT = NewV.getValueType();
7856     EVT ConcatVT = ConVec.getValueType();
7857     for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
7858       SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
7859                                 DAG.getIntPtrConstant(i, dl));
7860       ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
7861                            DAG.getConstant(j, dl, MVT::i32));
7862     }
7863     return ConVec;
7864   };
7865   unsigned j = 0;
7866   ConVec = ExractInto(NewV1, ConVec, j);
7867   ConVec = ExractInto(NewV2, ConVec, j);
7868
7869   // Now return the result of comparing the subvector with zero,
7870   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
7871   return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
7872                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
7873 }
7874
7875 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
7876                                    const ARMSubtarget *ST) {
7877   EVT VT = Op->getValueType(0);
7878   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7879     return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
7880
7881   // The only time a CONCAT_VECTORS operation can have legal types is when
7882   // two 64-bit vectors are concatenated to a 128-bit vector.
7883   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
7884          "unexpected CONCAT_VECTORS");
7885   SDLoc dl(Op);
7886   SDValue Val = DAG.getUNDEF(MVT::v2f64);
7887   SDValue Op0 = Op.getOperand(0);
7888   SDValue Op1 = Op.getOperand(1);
7889   if (!Op0.isUndef())
7890     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
7891                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
7892                       DAG.getIntPtrConstant(0, dl));
7893   if (!Op1.isUndef())
7894     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
7895                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
7896                       DAG.getIntPtrConstant(1, dl));
7897   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
7898 }
7899
7900 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
7901                                       const ARMSubtarget *ST) {
7902   SDValue V1 = Op.getOperand(0);
7903   SDValue V2 = Op.getOperand(1);
7904   SDLoc dl(Op);
7905   EVT VT = Op.getValueType();
7906   EVT Op1VT = V1.getValueType();
7907   unsigned NumElts = VT.getVectorNumElements();
7908   unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
7909
7910   assert(VT.getScalarSizeInBits() == 1 &&
7911          "Unexpected custom EXTRACT_SUBVECTOR lowering");
7912   assert(ST->hasMVEIntegerOps() &&
7913          "EXTRACT_SUBVECTOR lowering only supported for MVE");
7914
7915   SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
7916
7917   // We now have Op1 promoted to a vector of integers, where v8i1 gets
7918   // promoted to v8i16, etc.
7919
7920   MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
7921
7922   EVT SubVT = MVT::getVectorVT(ElType, NumElts);
7923   SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
7924   for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
7925     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
7926                               DAG.getIntPtrConstant(i, dl));
7927     SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
7928                          DAG.getConstant(j, dl, MVT::i32));
7929   }
7930
7931   // Now return the result of comparing the subvector with zero,
7932   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
7933   return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
7934                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
7935 }
7936
7937 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
7938 /// element has been zero/sign-extended, depending on the isSigned parameter,
7939 /// from an integer type half its size.
7940 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
7941                                    bool isSigned) {
7942   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
7943   EVT VT = N->getValueType(0);
7944   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
7945     SDNode *BVN = N->getOperand(0).getNode();
7946     if (BVN->getValueType(0) != MVT::v4i32 ||
7947         BVN->getOpcode() != ISD::BUILD_VECTOR)
7948       return false;
7949     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
7950     unsigned HiElt = 1 - LoElt;
7951     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
7952     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
7953     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
7954     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
7955     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
7956       return false;
7957     if (isSigned) {
7958       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
7959           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
7960         return true;
7961     } else {
7962       if (Hi0->isNullValue() && Hi1->isNullValue())
7963         return true;
7964     }
7965     return false;
7966   }
7967
7968   if (N->getOpcode() != ISD::BUILD_VECTOR)
7969     return false;
7970
7971   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
7972     SDNode *Elt = N->getOperand(i).getNode();
7973     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
7974       unsigned EltSize = VT.getScalarSizeInBits();
7975       unsigned HalfSize = EltSize / 2;
7976       if (isSigned) {
7977         if (!isIntN(HalfSize, C->getSExtValue()))
7978           return false;
7979       } else {
7980         if (!isUIntN(HalfSize, C->getZExtValue()))
7981           return false;
7982       }
7983       continue;
7984     }
7985     return false;
7986   }
7987
7988   return true;
7989 }
7990
7991 /// isSignExtended - Check if a node is a vector value that is sign-extended
7992 /// or a constant BUILD_VECTOR with sign-extended elements.
7993 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
7994   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
7995     return true;
7996   if (isExtendedBUILD_VECTOR(N, DAG, true))
7997     return true;
7998   return false;
7999 }
8000
8001 /// isZeroExtended - Check if a node is a vector value that is zero-extended
8002 /// or a constant BUILD_VECTOR with zero-extended elements.
8003 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
8004   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
8005     return true;
8006   if (isExtendedBUILD_VECTOR(N, DAG, false))
8007     return true;
8008   return false;
8009 }
8010
8011 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
8012   if (OrigVT.getSizeInBits() >= 64)
8013     return OrigVT;
8014
8015   assert(OrigVT.isSimple() && "Expecting a simple value type");
8016
8017   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
8018   switch (OrigSimpleTy) {
8019   default: llvm_unreachable("Unexpected Vector Type");
8020   case MVT::v2i8:
8021   case MVT::v2i16:
8022      return MVT::v2i32;
8023   case MVT::v4i8:
8024     return  MVT::v4i16;
8025   }
8026 }
8027
8028 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
8029 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
8030 /// We insert the required extension here to get the vector to fill a D register.
8031 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
8032                                             const EVT &OrigTy,
8033                                             const EVT &ExtTy,
8034                                             unsigned ExtOpcode) {
8035   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
8036   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
8037   // 64-bits we need to insert a new extension so that it will be 64-bits.
8038   assert(ExtTy.is128BitVector() && "Unexpected extension size");
8039   if (OrigTy.getSizeInBits() >= 64)
8040     return N;
8041
8042   // Must extend size to at least 64 bits to be used as an operand for VMULL.
8043   EVT NewVT = getExtensionTo64Bits(OrigTy);
8044
8045   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
8046 }
8047
8048 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
8049 /// does not do any sign/zero extension. If the original vector is less
8050 /// than 64 bits, an appropriate extension will be added after the load to
8051 /// reach a total size of 64 bits. We have to add the extension separately
8052 /// because ARM does not have a sign/zero extending load for vectors.
8053 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
8054   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
8055
8056   // The load already has the right type.
8057   if (ExtendedTy == LD->getMemoryVT())
8058     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
8059                        LD->getBasePtr(), LD->getPointerInfo(),
8060                        LD->getAlignment(), LD->getMemOperand()->getFlags());
8061
8062   // We need to create a zextload/sextload. We cannot just create a load
8063   // followed by a zext/zext node because LowerMUL is also run during normal
8064   // operation legalization where we can't create illegal types.
8065   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
8066                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
8067                         LD->getMemoryVT(), LD->getAlignment(),
8068                         LD->getMemOperand()->getFlags());
8069 }
8070
8071 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
8072 /// extending load, or BUILD_VECTOR with extended elements, return the
8073 /// unextended value. The unextended vector should be 64 bits so that it can
8074 /// be used as an operand to a VMULL instruction. If the original vector size
8075 /// before extension is less than 64 bits we add a an extension to resize
8076 /// the vector to 64 bits.
8077 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
8078   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
8079     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
8080                                         N->getOperand(0)->getValueType(0),
8081                                         N->getValueType(0),
8082                                         N->getOpcode());
8083
8084   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
8085     assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
8086            "Expected extending load");
8087
8088     SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
8089     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
8090     unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8091     SDValue extLoad =
8092         DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
8093     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
8094
8095     return newLoad;
8096   }
8097
8098   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
8099   // have been legalized as a BITCAST from v4i32.
8100   if (N->getOpcode() == ISD::BITCAST) {
8101     SDNode *BVN = N->getOperand(0).getNode();
8102     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
8103            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
8104     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
8105     return DAG.getBuildVector(
8106         MVT::v2i32, SDLoc(N),
8107         {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
8108   }
8109   // Construct a new BUILD_VECTOR with elements truncated to half the size.
8110   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
8111   EVT VT = N->getValueType(0);
8112   unsigned EltSize = VT.getScalarSizeInBits() / 2;
8113   unsigned NumElts = VT.getVectorNumElements();
8114   MVT TruncVT = MVT::getIntegerVT(EltSize);
8115   SmallVector<SDValue, 8> Ops;
8116   SDLoc dl(N);
8117   for (unsigned i = 0; i != NumElts; ++i) {
8118     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
8119     const APInt &CInt = C->getAPIntValue();
8120     // Element types smaller than 32 bits are not legal, so use i32 elements.
8121     // The values are implicitly truncated so sext vs. zext doesn't matter.
8122     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
8123   }
8124   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
8125 }
8126
8127 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
8128   unsigned Opcode = N->getOpcode();
8129   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
8130     SDNode *N0 = N->getOperand(0).getNode();
8131     SDNode *N1 = N->getOperand(1).getNode();
8132     return N0->hasOneUse() && N1->hasOneUse() &&
8133       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
8134   }
8135   return false;
8136 }
8137
8138 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
8139   unsigned Opcode = N->getOpcode();
8140   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
8141     SDNode *N0 = N->getOperand(0).getNode();
8142     SDNode *N1 = N->getOperand(1).getNode();
8143     return N0->hasOneUse() && N1->hasOneUse() &&
8144       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
8145   }
8146   return false;
8147 }
8148
8149 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
8150   // Multiplications are only custom-lowered for 128-bit vectors so that
8151   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
8152   EVT VT = Op.getValueType();
8153   assert(VT.is128BitVector() && VT.isInteger() &&
8154          "unexpected type for custom-lowering ISD::MUL");
8155   SDNode *N0 = Op.getOperand(0).getNode();
8156   SDNode *N1 = Op.getOperand(1).getNode();
8157   unsigned NewOpc = 0;
8158   bool isMLA = false;
8159   bool isN0SExt = isSignExtended(N0, DAG);
8160   bool isN1SExt = isSignExtended(N1, DAG);
8161   if (isN0SExt && isN1SExt)
8162     NewOpc = ARMISD::VMULLs;
8163   else {
8164     bool isN0ZExt = isZeroExtended(N0, DAG);
8165     bool isN1ZExt = isZeroExtended(N1, DAG);
8166     if (isN0ZExt && isN1ZExt)
8167       NewOpc = ARMISD::VMULLu;
8168     else if (isN1SExt || isN1ZExt) {
8169       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
8170       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
8171       if (isN1SExt && isAddSubSExt(N0, DAG)) {
8172         NewOpc = ARMISD::VMULLs;
8173         isMLA = true;
8174       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
8175         NewOpc = ARMISD::VMULLu;
8176         isMLA = true;
8177       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
8178         std::swap(N0, N1);
8179         NewOpc = ARMISD::VMULLu;
8180         isMLA = true;
8181       }
8182     }
8183
8184     if (!NewOpc) {
8185       if (VT == MVT::v2i64)
8186         // Fall through to expand this.  It is not legal.
8187         return SDValue();
8188       else
8189         // Other vector multiplications are legal.
8190         return Op;
8191     }
8192   }
8193
8194   // Legalize to a VMULL instruction.
8195   SDLoc DL(Op);
8196   SDValue Op0;
8197   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
8198   if (!isMLA) {
8199     Op0 = SkipExtensionForVMULL(N0, DAG);
8200     assert(Op0.getValueType().is64BitVector() &&
8201            Op1.getValueType().is64BitVector() &&
8202            "unexpected types for extended operands to VMULL");
8203     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
8204   }
8205
8206   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
8207   // isel lowering to take advantage of no-stall back to back vmul + vmla.
8208   //   vmull q0, d4, d6
8209   //   vmlal q0, d5, d6
8210   // is faster than
8211   //   vaddl q0, d4, d5
8212   //   vmovl q1, d6
8213   //   vmul  q0, q0, q1
8214   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
8215   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
8216   EVT Op1VT = Op1.getValueType();
8217   return DAG.getNode(N0->getOpcode(), DL, VT,
8218                      DAG.getNode(NewOpc, DL, VT,
8219                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
8220                      DAG.getNode(NewOpc, DL, VT,
8221                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
8222 }
8223
8224 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
8225                               SelectionDAG &DAG) {
8226   // TODO: Should this propagate fast-math-flags?
8227
8228   // Convert to float
8229   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
8230   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
8231   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
8232   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
8233   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
8234   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
8235   // Get reciprocal estimate.
8236   // float4 recip = vrecpeq_f32(yf);
8237   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8238                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
8239                    Y);
8240   // Because char has a smaller range than uchar, we can actually get away
8241   // without any newton steps.  This requires that we use a weird bias
8242   // of 0xb000, however (again, this has been exhaustively tested).
8243   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
8244   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
8245   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
8246   Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
8247   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
8248   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
8249   // Convert back to short.
8250   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
8251   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
8252   return X;
8253 }
8254
8255 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
8256                                SelectionDAG &DAG) {
8257   // TODO: Should this propagate fast-math-flags?
8258
8259   SDValue N2;
8260   // Convert to float.
8261   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
8262   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
8263   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
8264   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
8265   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
8266   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
8267
8268   // Use reciprocal estimate and one refinement step.
8269   // float4 recip = vrecpeq_f32(yf);
8270   // recip *= vrecpsq_f32(yf, recip);
8271   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8272                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
8273                    N1);
8274   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8275                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
8276                    N1, N2);
8277   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
8278   // Because short has a smaller range than ushort, we can actually get away
8279   // with only a single newton step.  This requires that we use a weird bias
8280   // of 89, however (again, this has been exhaustively tested).
8281   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
8282   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
8283   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
8284   N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
8285   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
8286   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
8287   // Convert back to integer and return.
8288   // return vmovn_s32(vcvt_s32_f32(result));
8289   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
8290   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
8291   return N0;
8292 }
8293
8294 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
8295                          const ARMSubtarget *ST) {
8296   EVT VT = Op.getValueType();
8297   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
8298          "unexpected type for custom-lowering ISD::SDIV");
8299
8300   SDLoc dl(Op);
8301   SDValue N0 = Op.getOperand(0);
8302   SDValue N1 = Op.getOperand(1);
8303   SDValue N2, N3;
8304
8305   if (VT == MVT::v8i8) {
8306     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
8307     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
8308
8309     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8310                      DAG.getIntPtrConstant(4, dl));
8311     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8312                      DAG.getIntPtrConstant(4, dl));
8313     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8314                      DAG.getIntPtrConstant(0, dl));
8315     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8316                      DAG.getIntPtrConstant(0, dl));
8317
8318     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
8319     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
8320
8321     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
8322     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
8323
8324     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
8325     return N0;
8326   }
8327   return LowerSDIV_v4i16(N0, N1, dl, DAG);
8328 }
8329
8330 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
8331                          const ARMSubtarget *ST) {
8332   // TODO: Should this propagate fast-math-flags?
8333   EVT VT = Op.getValueType();
8334   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
8335          "unexpected type for custom-lowering ISD::UDIV");
8336
8337   SDLoc dl(Op);
8338   SDValue N0 = Op.getOperand(0);
8339   SDValue N1 = Op.getOperand(1);
8340   SDValue N2, N3;
8341
8342   if (VT == MVT::v8i8) {
8343     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
8344     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
8345
8346     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8347                      DAG.getIntPtrConstant(4, dl));
8348     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8349                      DAG.getIntPtrConstant(4, dl));
8350     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8351                      DAG.getIntPtrConstant(0, dl));
8352     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8353                      DAG.getIntPtrConstant(0, dl));
8354
8355     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
8356     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
8357
8358     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
8359     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
8360
8361     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
8362                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
8363                                      MVT::i32),
8364                      N0);
8365     return N0;
8366   }
8367
8368   // v4i16 sdiv ... Convert to float.
8369   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
8370   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
8371   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
8372   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
8373   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
8374   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
8375
8376   // Use reciprocal estimate and two refinement steps.
8377   // float4 recip = vrecpeq_f32(yf);
8378   // recip *= vrecpsq_f32(yf, recip);
8379   // recip *= vrecpsq_f32(yf, recip);
8380   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8381                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
8382                    BN1);
8383   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8384                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
8385                    BN1, N2);
8386   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
8387   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8388                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
8389                    BN1, N2);
8390   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
8391   // Simply multiplying by the reciprocal estimate can leave us a few ulps
8392   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
8393   // and that it will never cause us to return an answer too large).
8394   // float4 result = as_float4(as_int4(xf*recip) + 2);
8395   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
8396   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
8397   N1 = DAG.getConstant(2, dl, MVT::v4i32);
8398   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
8399   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
8400   // Convert back to integer and return.
8401   // return vmovn_u32(vcvt_s32_f32(result));
8402   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
8403   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
8404   return N0;
8405 }
8406
8407 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
8408   SDNode *N = Op.getNode();
8409   EVT VT = N->getValueType(0);
8410   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
8411
8412   SDValue Carry = Op.getOperand(2);
8413
8414   SDLoc DL(Op);
8415
8416   SDValue Result;
8417   if (Op.getOpcode() == ISD::ADDCARRY) {
8418     // This converts the boolean value carry into the carry flag.
8419     Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
8420
8421     // Do the addition proper using the carry flag we wanted.
8422     Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
8423                          Op.getOperand(1), Carry);
8424
8425     // Now convert the carry flag into a boolean value.
8426     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
8427   } else {
8428     // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
8429     // have to invert the carry first.
8430     Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
8431                         DAG.getConstant(1, DL, MVT::i32), Carry);
8432     // This converts the boolean value carry into the carry flag.
8433     Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
8434
8435     // Do the subtraction proper using the carry flag we wanted.
8436     Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
8437                          Op.getOperand(1), Carry);
8438
8439     // Now convert the carry flag into a boolean value.
8440     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
8441     // But the carry returned by ARMISD::SUBE is not a borrow as expected
8442     // by ISD::SUBCARRY, so compute 1 - C.
8443     Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
8444                         DAG.getConstant(1, DL, MVT::i32), Carry);
8445   }
8446
8447   // Return both values.
8448   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
8449 }
8450
8451 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
8452   assert(Subtarget->isTargetDarwin());
8453
8454   // For iOS, we want to call an alternative entry point: __sincos_stret,
8455   // return values are passed via sret.
8456   SDLoc dl(Op);
8457   SDValue Arg = Op.getOperand(0);
8458   EVT ArgVT = Arg.getValueType();
8459   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
8460   auto PtrVT = getPointerTy(DAG.getDataLayout());
8461
8462   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8463   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8464
8465   // Pair of floats / doubles used to pass the result.
8466   Type *RetTy = StructType::get(ArgTy, ArgTy);
8467   auto &DL = DAG.getDataLayout();
8468
8469   ArgListTy Args;
8470   bool ShouldUseSRet = Subtarget->isAPCS_ABI();
8471   SDValue SRet;
8472   if (ShouldUseSRet) {
8473     // Create stack object for sret.
8474     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
8475     const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
8476     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
8477     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
8478
8479     ArgListEntry Entry;
8480     Entry.Node = SRet;
8481     Entry.Ty = RetTy->getPointerTo();
8482     Entry.IsSExt = false;
8483     Entry.IsZExt = false;
8484     Entry.IsSRet = true;
8485     Args.push_back(Entry);
8486     RetTy = Type::getVoidTy(*DAG.getContext());
8487   }
8488
8489   ArgListEntry Entry;
8490   Entry.Node = Arg;
8491   Entry.Ty = ArgTy;
8492   Entry.IsSExt = false;
8493   Entry.IsZExt = false;
8494   Args.push_back(Entry);
8495
8496   RTLIB::Libcall LC =
8497       (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
8498   const char *LibcallName = getLibcallName(LC);
8499   CallingConv::ID CC = getLibcallCallingConv(LC);
8500   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
8501
8502   TargetLowering::CallLoweringInfo CLI(DAG);
8503   CLI.setDebugLoc(dl)
8504       .setChain(DAG.getEntryNode())
8505       .setCallee(CC, RetTy, Callee, std::move(Args))
8506       .setDiscardResult(ShouldUseSRet);
8507   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
8508
8509   if (!ShouldUseSRet)
8510     return CallResult.first;
8511
8512   SDValue LoadSin =
8513       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
8514
8515   // Address of cos field.
8516   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
8517                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
8518   SDValue LoadCos =
8519       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
8520
8521   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
8522   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
8523                      LoadSin.getValue(0), LoadCos.getValue(0));
8524 }
8525
8526 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
8527                                                   bool Signed,
8528                                                   SDValue &Chain) const {
8529   EVT VT = Op.getValueType();
8530   assert((VT == MVT::i32 || VT == MVT::i64) &&
8531          "unexpected type for custom lowering DIV");
8532   SDLoc dl(Op);
8533
8534   const auto &DL = DAG.getDataLayout();
8535   const auto &TLI = DAG.getTargetLoweringInfo();
8536
8537   const char *Name = nullptr;
8538   if (Signed)
8539     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
8540   else
8541     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
8542
8543   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
8544
8545   ARMTargetLowering::ArgListTy Args;
8546
8547   for (auto AI : {1, 0}) {
8548     ArgListEntry Arg;
8549     Arg.Node = Op.getOperand(AI);
8550     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
8551     Args.push_back(Arg);
8552   }
8553
8554   CallLoweringInfo CLI(DAG);
8555   CLI.setDebugLoc(dl)
8556     .setChain(Chain)
8557     .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
8558                ES, std::move(Args));
8559
8560   return LowerCallTo(CLI).first;
8561 }
8562
8563 // This is a code size optimisation: return the original SDIV node to
8564 // DAGCombiner when we don't want to expand SDIV into a sequence of
8565 // instructions, and an empty node otherwise which will cause the
8566 // SDIV to be expanded in DAGCombine.
8567 SDValue
8568 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
8569                                  SelectionDAG &DAG,
8570                                  SmallVectorImpl<SDNode *> &Created) const {
8571   // TODO: Support SREM
8572   if (N->getOpcode() != ISD::SDIV)
8573     return SDValue();
8574
8575   const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
8576   const bool MinSize = ST.hasMinSize();
8577   const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
8578                                       : ST.hasDivideInARMMode();
8579
8580   // Don't touch vector types; rewriting this may lead to scalarizing
8581   // the int divs.
8582   if (N->getOperand(0).getValueType().isVector())
8583     return SDValue();
8584
8585   // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
8586   // hwdiv support for this to be really profitable.
8587   if (!(MinSize && HasDivide))
8588     return SDValue();
8589
8590   // ARM mode is a bit simpler than Thumb: we can handle large power
8591   // of 2 immediates with 1 mov instruction; no further checks required,
8592   // just return the sdiv node.
8593   if (!ST.isThumb())
8594     return SDValue(N, 0);
8595
8596   // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
8597   // and thus lose the code size benefits of a MOVS that requires only 2.
8598   // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
8599   // but as it's doing exactly this, it's not worth the trouble to get TTI.
8600   if (Divisor.sgt(128))
8601     return SDValue();
8602
8603   return SDValue(N, 0);
8604 }
8605
8606 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
8607                                             bool Signed) const {
8608   assert(Op.getValueType() == MVT::i32 &&
8609          "unexpected type for custom lowering DIV");
8610   SDLoc dl(Op);
8611
8612   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
8613                                DAG.getEntryNode(), Op.getOperand(1));
8614
8615   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
8616 }
8617
8618 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
8619   SDLoc DL(N);
8620   SDValue Op = N->getOperand(1);
8621   if (N->getValueType(0) == MVT::i32)
8622     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
8623   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
8624                            DAG.getConstant(0, DL, MVT::i32));
8625   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
8626                            DAG.getConstant(1, DL, MVT::i32));
8627   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
8628                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
8629 }
8630
8631 void ARMTargetLowering::ExpandDIV_Windows(
8632     SDValue Op, SelectionDAG &DAG, bool Signed,
8633     SmallVectorImpl<SDValue> &Results) const {
8634   const auto &DL = DAG.getDataLayout();
8635   const auto &TLI = DAG.getTargetLoweringInfo();
8636
8637   assert(Op.getValueType() == MVT::i64 &&
8638          "unexpected type for custom lowering DIV");
8639   SDLoc dl(Op);
8640
8641   SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
8642
8643   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
8644
8645   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
8646   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
8647                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
8648   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
8649
8650   Results.push_back(Lower);
8651   Results.push_back(Upper);
8652 }
8653
8654 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
8655   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
8656     // Acquire/Release load/store is not legal for targets without a dmb or
8657     // equivalent available.
8658     return SDValue();
8659
8660   // Monotonic load/store is legal for all targets.
8661   return Op;
8662 }
8663
8664 static void ReplaceREADCYCLECOUNTER(SDNode *N,
8665                                     SmallVectorImpl<SDValue> &Results,
8666                                     SelectionDAG &DAG,
8667                                     const ARMSubtarget *Subtarget) {
8668   SDLoc DL(N);
8669   // Under Power Management extensions, the cycle-count is:
8670   //    mrc p15, #0, <Rt>, c9, c13, #0
8671   SDValue Ops[] = { N->getOperand(0), // Chain
8672                     DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
8673                     DAG.getConstant(15, DL, MVT::i32),
8674                     DAG.getConstant(0, DL, MVT::i32),
8675                     DAG.getConstant(9, DL, MVT::i32),
8676                     DAG.getConstant(13, DL, MVT::i32),
8677                     DAG.getConstant(0, DL, MVT::i32)
8678   };
8679
8680   SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
8681                                  DAG.getVTList(MVT::i32, MVT::Other), Ops);
8682   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
8683                                 DAG.getConstant(0, DL, MVT::i32)));
8684   Results.push_back(Cycles32.getValue(1));
8685 }
8686
8687 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
8688   SDLoc dl(V.getNode());
8689   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
8690   SDValue VHi = DAG.getAnyExtOrTrunc(
8691       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
8692       dl, MVT::i32);
8693   bool isBigEndian = DAG.getDataLayout().isBigEndian();
8694   if (isBigEndian)
8695     std::swap (VLo, VHi);
8696   SDValue RegClass =
8697       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
8698   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
8699   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
8700   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
8701   return SDValue(
8702       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
8703 }
8704
8705 static void ReplaceCMP_SWAP_64Results(SDNode *N,
8706                                        SmallVectorImpl<SDValue> & Results,
8707                                        SelectionDAG &DAG) {
8708   assert(N->getValueType(0) == MVT::i64 &&
8709          "AtomicCmpSwap on types less than 64 should be legal");
8710   SDValue Ops[] = {N->getOperand(1),
8711                    createGPRPairNode(DAG, N->getOperand(2)),
8712                    createGPRPairNode(DAG, N->getOperand(3)),
8713                    N->getOperand(0)};
8714   SDNode *CmpSwap = DAG.getMachineNode(
8715       ARM::CMP_SWAP_64, SDLoc(N),
8716       DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
8717
8718   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
8719   DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
8720
8721   bool isBigEndian = DAG.getDataLayout().isBigEndian();
8722
8723   Results.push_back(
8724       DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
8725                                  SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
8726   Results.push_back(
8727       DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
8728                                  SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
8729   Results.push_back(SDValue(CmpSwap, 2));
8730 }
8731
8732 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
8733                           SelectionDAG &DAG) {
8734   const auto &TLI = DAG.getTargetLoweringInfo();
8735
8736   assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
8737          "Custom lowering is MSVCRT specific!");
8738
8739   SDLoc dl(Op);
8740   SDValue Val = Op.getOperand(0);
8741   MVT Ty = Val->getSimpleValueType(0);
8742   SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
8743   SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
8744                                          TLI.getPointerTy(DAG.getDataLayout()));
8745
8746   TargetLowering::ArgListTy Args;
8747   TargetLowering::ArgListEntry Entry;
8748
8749   Entry.Node = Val;
8750   Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
8751   Entry.IsZExt = true;
8752   Args.push_back(Entry);
8753
8754   Entry.Node = Exponent;
8755   Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
8756   Entry.IsZExt = true;
8757   Args.push_back(Entry);
8758
8759   Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
8760
8761   // In the in-chain to the call is the entry node  If we are emitting a
8762   // tailcall, the chain will be mutated if the node has a non-entry input
8763   // chain.
8764   SDValue InChain = DAG.getEntryNode();
8765   SDValue TCChain = InChain;
8766
8767   const Function &F = DAG.getMachineFunction().getFunction();
8768   bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
8769               F.getReturnType() == LCRTy;
8770   if (IsTC)
8771     InChain = TCChain;
8772
8773   TargetLowering::CallLoweringInfo CLI(DAG);
8774   CLI.setDebugLoc(dl)
8775       .setChain(InChain)
8776       .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
8777       .setTailCall(IsTC);
8778   std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
8779
8780   // Return the chain (the DAG root) if it is a tail call
8781   return !CI.second.getNode() ? DAG.getRoot() : CI.first;
8782 }
8783
8784 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
8785   LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
8786   switch (Op.getOpcode()) {
8787   default: llvm_unreachable("Don't know how to custom lower this!");
8788   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
8789   case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
8790   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
8791   case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
8792   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
8793   case ISD::SELECT:        return LowerSELECT(Op, DAG);
8794   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
8795   case ISD::BRCOND:        return LowerBRCOND(Op, DAG);
8796   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
8797   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
8798   case ISD::VASTART:       return LowerVASTART(Op, DAG);
8799   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
8800   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
8801   case ISD::SINT_TO_FP:
8802   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
8803   case ISD::FP_TO_SINT:
8804   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
8805   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
8806   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
8807   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
8808   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
8809   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
8810   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
8811   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
8812                                                                Subtarget);
8813   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
8814   case ISD::SHL:
8815   case ISD::SRL:
8816   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
8817   case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
8818   case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
8819   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
8820   case ISD::SRL_PARTS:
8821   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
8822   case ISD::CTTZ:
8823   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
8824   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
8825   case ISD::SETCC:         return LowerVSETCC(Op, DAG, Subtarget);
8826   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);
8827   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
8828   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
8829   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
8830   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
8831   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
8832   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
8833   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
8834   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
8835   case ISD::MUL:           return LowerMUL(Op, DAG);
8836   case ISD::SDIV:
8837     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
8838       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
8839     return LowerSDIV(Op, DAG, Subtarget);
8840   case ISD::UDIV:
8841     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
8842       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
8843     return LowerUDIV(Op, DAG, Subtarget);
8844   case ISD::ADDCARRY:
8845   case ISD::SUBCARRY:      return LowerADDSUBCARRY(Op, DAG);
8846   case ISD::SADDO:
8847   case ISD::SSUBO:
8848     return LowerSignedALUO(Op, DAG);
8849   case ISD::UADDO:
8850   case ISD::USUBO:
8851     return LowerUnsignedALUO(Op, DAG);
8852   case ISD::ATOMIC_LOAD:
8853   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
8854   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
8855   case ISD::SDIVREM:
8856   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
8857   case ISD::DYNAMIC_STACKALLOC:
8858     if (Subtarget->isTargetWindows())
8859       return LowerDYNAMIC_STACKALLOC(Op, DAG);
8860     llvm_unreachable("Don't know how to custom lower this!");
8861   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
8862   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
8863   case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
8864   case ARMISD::WIN__DBZCHK: return SDValue();
8865   }
8866 }
8867
8868 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
8869                                  SelectionDAG &DAG) {
8870   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
8871   unsigned Opc = 0;
8872   if (IntNo == Intrinsic::arm_smlald)
8873     Opc = ARMISD::SMLALD;
8874   else if (IntNo == Intrinsic::arm_smlaldx)
8875     Opc = ARMISD::SMLALDX;
8876   else if (IntNo == Intrinsic::arm_smlsld)
8877     Opc = ARMISD::SMLSLD;
8878   else if (IntNo == Intrinsic::arm_smlsldx)
8879     Opc = ARMISD::SMLSLDX;
8880   else
8881     return;
8882
8883   SDLoc dl(N);
8884   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
8885                            N->getOperand(3),
8886                            DAG.getConstant(0, dl, MVT::i32));
8887   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
8888                            N->getOperand(3),
8889                            DAG.getConstant(1, dl, MVT::i32));
8890
8891   SDValue LongMul = DAG.getNode(Opc, dl,
8892                                 DAG.getVTList(MVT::i32, MVT::i32),
8893                                 N->getOperand(1), N->getOperand(2),
8894                                 Lo, Hi);
8895   Results.push_back(LongMul.getValue(0));
8896   Results.push_back(LongMul.getValue(1));
8897 }
8898
8899 /// ReplaceNodeResults - Replace the results of node with an illegal result
8900 /// type with new values built out of custom code.
8901 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
8902                                            SmallVectorImpl<SDValue> &Results,
8903                                            SelectionDAG &DAG) const {
8904   SDValue Res;
8905   switch (N->getOpcode()) {
8906   default:
8907     llvm_unreachable("Don't know how to custom expand this!");
8908   case ISD::READ_REGISTER:
8909     ExpandREAD_REGISTER(N, Results, DAG);
8910     break;
8911   case ISD::BITCAST:
8912     Res = ExpandBITCAST(N, DAG, Subtarget);
8913     break;
8914   case ISD::SRL:
8915   case ISD::SRA:
8916   case ISD::SHL:
8917     Res = Expand64BitShift(N, DAG, Subtarget);
8918     break;
8919   case ISD::SREM:
8920   case ISD::UREM:
8921     Res = LowerREM(N, DAG);
8922     break;
8923   case ISD::SDIVREM:
8924   case ISD::UDIVREM:
8925     Res = LowerDivRem(SDValue(N, 0), DAG);
8926     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
8927     Results.push_back(Res.getValue(0));
8928     Results.push_back(Res.getValue(1));
8929     return;
8930   case ISD::READCYCLECOUNTER:
8931     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
8932     return;
8933   case ISD::UDIV:
8934   case ISD::SDIV:
8935     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
8936     return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
8937                              Results);
8938   case ISD::ATOMIC_CMP_SWAP:
8939     ReplaceCMP_SWAP_64Results(N, Results, DAG);
8940     return;
8941   case ISD::INTRINSIC_WO_CHAIN:
8942     return ReplaceLongIntrinsic(N, Results, DAG);
8943   case ISD::ABS:
8944      lowerABS(N, Results, DAG);
8945      return ;
8946
8947   }
8948   if (Res.getNode())
8949     Results.push_back(Res);
8950 }
8951
8952 //===----------------------------------------------------------------------===//
8953 //                           ARM Scheduler Hooks
8954 //===----------------------------------------------------------------------===//
8955
8956 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
8957 /// registers the function context.
///
/// The new instructions are inserted into \p MBB immediately before \p MI and
/// reuse MI's debug location. They load a constant-pool entry that refers to
/// \p DispatchBB, add the PC to it (PICADD / tPICADD), set bit 0 of the
/// address in the Thumb variants, and store the result at byte offset 36 from
/// frame index \p FI.
///
/// NOTE(review): the pseudo-assembly comments below show "#+4 ; &jbuf[1]"
/// while the emitted immediate is 36 -- presumably 36 is the offset of that
/// jump-buffer slot within the whole function context; confirm.
8958 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
8959                                                MachineBasicBlock *MBB,
8960                                                MachineBasicBlock *DispatchBB,
8961                                                int FI) const {
8962   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
8963          "ROPI/RWPI not currently supported with SjLj");
8964   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
8965   DebugLoc dl = MI.getDebugLoc();
8966   MachineFunction *MF = MBB->getParent();
8967   MachineRegisterInfo *MRI = &MF->getRegInfo();
8968   MachineConstantPool *MCP = MF->getConstantPool();
8969   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
8970   const Function &F = MF->getFunction();
8971
8972   bool isThumb = Subtarget->isThumb();
8973   bool isThumb2 = Subtarget->isThumb2();
8974
  // Create a constant-pool entry that refers to DispatchBB. It is tagged
  // with a fresh PIC label id and a PC adjustment of 4 in the Thumb modes or
  // 8 in ARM mode; the entry itself is requested with alignment 4.
8975   unsigned PCLabelId = AFI->createPICLabelUId();
8976   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
8977   ARMConstantPoolValue *CPV =
8978     ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
8979   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
8980
  // Register class for the temporaries: tGPR when isThumb() is set, GPR
  // otherwise.
8981   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
8982                                            : &ARM::GPRRegClass;
8983
8984   // Grab constant pool and fixed stack memory operands.
  // Both describe 4-byte accesses with 4-byte alignment: a load from the
  // constant pool and a store to the fixed stack slot FI.
8985   MachineMemOperand *CPMMO =
8986       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
8987                                MachineMemOperand::MOLoad, 4, 4);
8988
8989   MachineMemOperand *FIMMOSt =
8990       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
8991                                MachineMemOperand::MOStore, 4, 4);
8992
8993   // Load the address of the dispatch MBB into the jump buffer.
  // Three instruction sequences follow, one each for Thumb2, Thumb1 and ARM.
8994   if (isThumb2) {
8995     // Incoming value: jbuf
8996     //   ldr.n  r5, LCPI1_1
8997     //   orr    r5, r5, #1
8998     //   add    r5, pc
8999     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
9000     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
9001     BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
9002         .addConstantPoolIndex(CPI)
9003         .addMemOperand(CPMMO)
9004         .add(predOps(ARMCC::AL));
9005     // Set the low bit because of thumb mode.
9006     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
9007     BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
9008         .addReg(NewVReg1, RegState::Kill)
9009         .addImm(0x01)
9010         .add(predOps(ARMCC::AL))
9011         .add(condCodeOp());
9012     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
9013     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
9014       .addReg(NewVReg2, RegState::Kill)
9015       .addImm(PCLabelId);
9016     BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
9017         .addReg(NewVReg3, RegState::Kill)
9018         .addFrameIndex(FI)
9019         .addImm(36) // &jbuf[1] :: pc
9020         .addMemOperand(FIMMOSt)
9021         .add(predOps(ARMCC::AL));
9022   } else if (isThumb) {
9023     // Incoming value: jbuf
9024     //   ldr.n  r1, LCPI1_4
9025     //   add    r1, pc
9026     //   mov    r2, #1
9027     //   orrs   r1, r2
9028     //   add    r2, $jbuf, #+4 ; &jbuf[1]
9029     //   str    r1, [r2]
9030     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
9031     BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
9032         .addConstantPoolIndex(CPI)
9033         .addMemOperand(CPMMO)
9034         .add(predOps(ARMCC::AL));
9035     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
9036     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
9037       .addReg(NewVReg1, RegState::Kill)
9038       .addImm(PCLabelId);
9039     // Set the low bit because of thumb mode.
    // Here the constant 1 is first materialized in a register with tMOVi8
    // and then ORed in with tORR; both instructions also define CPSR.
9040     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
9041     BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
9042         .addReg(ARM::CPSR, RegState::Define)
9043         .addImm(1)
9044         .add(predOps(ARMCC::AL));
9045     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
9046     BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
9047         .addReg(ARM::CPSR, RegState::Define)
9048         .addReg(NewVReg2, RegState::Kill)
9049         .addReg(NewVReg3, RegState::Kill)
9050         .add(predOps(ARMCC::AL));
    // The destination address (FI + 36) is computed into a register with
    // tADDframe, and the store below then uses a zero immediate offset.
9051     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
9052     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
9053             .addFrameIndex(FI)
9054             .addImm(36); // &jbuf[1] :: pc
9055     BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
9056         .addReg(NewVReg4, RegState::Kill)
9057         .addReg(NewVReg5, RegState::Kill)
9058         .addImm(0)
9059         .addMemOperand(FIMMOSt)
9060         .add(predOps(ARMCC::AL));
9061   } else {
9062     // Incoming value: jbuf
9063     //   ldr  r1, LCPI1_1
9064     //   add  r1, pc, r1
9065     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
    // Unlike the Thumb sequences above, no instruction here sets bit 0 of
    // the address.
9066     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
9067     BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
9068         .addConstantPoolIndex(CPI)
9069         .addImm(0)
9070         .addMemOperand(CPMMO)
9071         .add(predOps(ARMCC::AL));
9072     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
9073     BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
9074         .addReg(NewVReg1, RegState::Kill)
9075         .addImm(PCLabelId)
9076         .add(predOps(ARMCC::AL));
9077     BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
9078         .addReg(NewVReg2, RegState::Kill)
9079         .addFrameIndex(FI)
9080         .addImm(36) // &jbuf[1] :: pc
9081         .addMemOperand(FIMMOSt)
9082         .add(predOps(ARMCC::AL));
9083   }
9084 }
9085
9086 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
9087                                               MachineBasicBlock *MBB) const {
9088   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
9089   DebugLoc dl = MI.getDebugLoc();
9090   MachineFunction *MF = MBB->getParent();
9091   MachineRegisterInfo *MRI = &MF->getRegInfo();
9092   MachineFrameInfo &MFI = MF->getFrameInfo();
9093   int FI = MFI.getFunctionContextIndex();
9094
9095   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
9096                                                         : &ARM::GPRnopcRegClass;
9097
9098   // Get a mapping of the call site numbers to all of the landing pads they're
9099   // associated with.
9100   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
9101   unsigned MaxCSNum = 0;
9102   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
9103        ++BB) {
9104     if (!BB->isEHPad()) continue;
9105
9106     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
9107     // pad.
9108     for (MachineBasicBlock::iterator
9109            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
9110       if (!II->isEHLabel()) continue;
9111
9112       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
9113       if (!MF->hasCallSiteLandingPad(Sym)) continue;
9114
9115       SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
9116       for (SmallVectorImpl<unsigned>::iterator
9117              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
9118            CSI != CSE; ++CSI) {
9119         CallSiteNumToLPad[*CSI].push_back(&*BB);
9120         MaxCSNum = std::max(MaxCSNum, *CSI);
9121       }
9122       break;
9123     }
9124   }
9125
9126   // Get an ordered list of the machine basic blocks for the jump table.
9127   std::vector<MachineBasicBlock*> LPadList;
9128   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
9129   LPadList.reserve(CallSiteNumToLPad.size());
9130   for (unsigned I = 1; I <= MaxCSNum; ++I) {
9131     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
9132     for (SmallVectorImpl<MachineBasicBlock*>::iterator
9133            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
9134       LPadList.push_back(*II);
9135       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
9136     }
9137   }
9138
9139   assert(!LPadList.empty() &&
9140          "No landing pad destinations for the dispatch jump table!");
9141
9142   // Create the jump table and associated information.
9143   MachineJumpTableInfo *JTI =
9144     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
9145   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
9146
9147   // Create the MBBs for the dispatch code.
9148
9149   // Shove the dispatch's address into the return slot in the function context.
9150   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
9151   DispatchBB->setIsEHPad();
9152
9153   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
9154   unsigned trap_opcode;
9155   if (Subtarget->isThumb())
9156     trap_opcode = ARM::tTRAP;
9157   else
9158     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
9159
9160   BuildMI(TrapBB, dl, TII->get(trap_opcode));
9161   DispatchBB->addSuccessor(TrapBB);
9162
9163   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
9164   DispatchBB->addSuccessor(DispContBB);
9165
9166   // Insert and MBBs.
9167   MF->insert(MF->end(), DispatchBB);
9168   MF->insert(MF->end(), DispContBB);
9169   MF->insert(MF->end(), TrapBB);
9170
9171   // Insert code into the entry block that creates and registers the function
9172   // context.
9173   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
9174
9175   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
9176       MachinePointerInfo::getFixedStack(*MF, FI),
9177       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
9178
9179   MachineInstrBuilder MIB;
9180   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
9181
9182   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
9183   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
9184
9185   // Add a register mask with no preserved registers.  This results in all
9186   // registers being marked as clobbered. This can't work if the dispatch block
9187   // is in a Thumb1 function and is linked with ARM code which uses the FP
9188   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
9189   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
9190
9191   bool IsPositionIndependent = isPositionIndependent();
9192   unsigned NumLPads = LPadList.size();
9193   if (Subtarget->isThumb2()) {
9194     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
9195     BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
9196         .addFrameIndex(FI)
9197         .addImm(4)
9198         .addMemOperand(FIMMOLd)
9199         .add(predOps(ARMCC::AL));
9200
9201     if (NumLPads < 256) {
9202       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
9203           .addReg(NewVReg1)
9204           .addImm(LPadList.size())
9205           .add(predOps(ARMCC::AL));
9206     } else {
9207       unsigned VReg1 = MRI->createVirtualRegister(TRC);
9208       BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
9209           .addImm(NumLPads & 0xFFFF)
9210           .add(predOps(ARMCC::AL));
9211
9212       unsigned VReg2 = VReg1;
9213       if ((NumLPads & 0xFFFF0000) != 0) {
9214         VReg2 = MRI->createVirtualRegister(TRC);
9215         BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
9216             .addReg(VReg1)
9217             .addImm(NumLPads >> 16)
9218             .add(predOps(ARMCC::AL));
9219       }
9220
9221       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
9222           .addReg(NewVReg1)
9223           .addReg(VReg2)
9224           .add(predOps(ARMCC::AL));
9225     }
9226
9227     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
9228       .addMBB(TrapBB)
9229       .addImm(ARMCC::HI)
9230       .addReg(ARM::CPSR);
9231
9232     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
9233     BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
9234         .addJumpTableIndex(MJTI)
9235         .add(predOps(ARMCC::AL));
9236
9237     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
9238     BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
9239         .addReg(NewVReg3, RegState::Kill)
9240         .addReg(NewVReg1)
9241         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
9242         .add(predOps(ARMCC::AL))
9243         .add(condCodeOp());
9244
9245     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
9246       .addReg(NewVReg4, RegState::Kill)
9247       .addReg(NewVReg1)
9248       .addJumpTableIndex(MJTI);
9249   } else if (Subtarget->isThumb()) {
9250     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
9251     BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
9252         .addFrameIndex(FI)
9253         .addImm(1)
9254         .addMemOperand(FIMMOLd)
9255         .add(predOps(ARMCC::AL));
9256
9257     if (NumLPads < 256) {
9258       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
9259           .addReg(NewVReg1)
9260           .addImm(NumLPads)
9261           .add(predOps(ARMCC::AL));
9262     } else {
9263       MachineConstantPool *ConstantPool = MF->getConstantPool();
9264       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
9265       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
9266
9267       // MachineConstantPool wants an explicit alignment.
9268       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
9269       if (Align == 0)
9270         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
9271       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
9272
9273       unsigned VReg1 = MRI->createVirtualRegister(TRC);
9274       BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
9275           .addReg(VReg1, RegState::Define)
9276           .addConstantPoolIndex(Idx)
9277           .add(predOps(ARMCC::AL));
9278       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
9279           .addReg(NewVReg1)
9280           .addReg(VReg1)
9281           .add(predOps(ARMCC::AL));
9282     }
9283
9284     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
9285       .addMBB(TrapBB)
9286       .addImm(ARMCC::HI)
9287       .addReg(ARM::CPSR);
9288
9289     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
9290     BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
9291         .addReg(ARM::CPSR, RegState::Define)
9292         .addReg(NewVReg1)
9293         .addImm(2)
9294         .add(predOps(ARMCC::AL));
9295
9296     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
9297     BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
9298         .addJumpTableIndex(MJTI)
9299         .add(predOps(ARMCC::AL));
9300
9301     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
9302     BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
9303         .addReg(ARM::CPSR, RegState::Define)
9304         .addReg(NewVReg2, RegState::Kill)
9305         .addReg(NewVReg3)
9306         .add(predOps(ARMCC::AL));
9307
9308     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
9309         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
9310
9311     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
9312     BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
9313         .addReg(NewVReg4, RegState::Kill)
9314         .addImm(0)
9315         .addMemOperand(JTMMOLd)
9316         .add(predOps(ARMCC::AL));
9317
9318     unsigned NewVReg6 = NewVReg5;
9319     if (IsPositionIndependent) {
9320       NewVReg6 = MRI->createVirtualRegister(TRC);
9321       BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
9322           .addReg(ARM::CPSR, RegState::Define)
9323           .addReg(NewVReg5, RegState::Kill)
9324           .addReg(NewVReg3)
9325           .add(predOps(ARMCC::AL));
9326     }
9327
9328     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
9329       .addReg(NewVReg6, RegState::Kill)
9330       .addJumpTableIndex(MJTI);
9331   } else {
9332     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
9333     BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
9334         .addFrameIndex(FI)
9335         .addImm(4)
9336         .addMemOperand(FIMMOLd)
9337         .add(predOps(ARMCC::AL));
9338
9339     if (NumLPads < 256) {
9340       BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
9341           .addReg(NewVReg1)
9342           .addImm(NumLPads)
9343           .add(predOps(ARMCC::AL));
9344     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
9345       unsigned VReg1 = MRI->createVirtualRegister(TRC);
9346       BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
9347           .addImm(NumLPads & 0xFFFF)
9348           .add(predOps(ARMCC::AL));
9349
9350       unsigned VReg2 = VReg1;
9351       if ((NumLPads & 0xFFFF0000) != 0) {
9352         VReg2 = MRI->createVirtualRegister(TRC);
9353         BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
9354             .addReg(VReg1)
9355             .addImm(NumLPads >> 16)
9356             .add(predOps(ARMCC::AL));
9357       }
9358
9359       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
9360           .addReg(NewVReg1)
9361           .addReg(VReg2)
9362           .add(predOps(ARMCC::AL));
9363     } else {
9364       MachineConstantPool *ConstantPool = MF->getConstantPool();
9365       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
9366       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
9367
9368       // MachineConstantPool wants an explicit alignment.
9369       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
9370       if (Align == 0)
9371         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
9372       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
9373
9374       unsigned VReg1 = MRI->createVirtualRegister(TRC);
9375       BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
9376           .addReg(VReg1, RegState::Define)
9377           .addConstantPoolIndex(Idx)
9378           .addImm(0)
9379           .add(predOps(ARMCC::AL));
9380       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
9381           .addReg(NewVReg1)
9382           .addReg(VReg1, RegState::Kill)
9383           .add(predOps(ARMCC::AL));
9384     }
9385
9386     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
9387       .addMBB(TrapBB)
9388       .addImm(ARMCC::HI)
9389       .addReg(ARM::CPSR);
9390
9391     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
9392     BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
9393         .addReg(NewVReg1)
9394         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
9395         .add(predOps(ARMCC::AL))
9396         .add(condCodeOp());
9397     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
9398     BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
9399         .addJumpTableIndex(MJTI)
9400         .add(predOps(ARMCC::AL));
9401
9402     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
9403         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
9404     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
9405     BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
9406         .addReg(NewVReg3, RegState::Kill)
9407         .addReg(NewVReg4)
9408         .addImm(0)
9409         .addMemOperand(JTMMOLd)
9410         .add(predOps(ARMCC::AL));
9411
9412     if (IsPositionIndependent) {
9413       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
9414         .addReg(NewVReg5, RegState::Kill)
9415         .addReg(NewVReg4)
9416         .addJumpTableIndex(MJTI);
9417     } else {
9418       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
9419         .addReg(NewVReg5, RegState::Kill)
9420         .addJumpTableIndex(MJTI);
9421     }
9422   }
9423
9424   // Add the jump table entries as successors to the MBB.
9425   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
9426   for (std::vector<MachineBasicBlock*>::iterator
9427          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
9428     MachineBasicBlock *CurMBB = *I;
9429     if (SeenMBBs.insert(CurMBB).second)
9430       DispContBB->addSuccessor(CurMBB);
9431   }
9432
9433   // N.B. the order the invoke BBs are processed in doesn't matter here.
9434   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
9435   SmallVector<MachineBasicBlock*, 64> MBBLPads;
9436   for (MachineBasicBlock *BB : InvokeBBs) {
9437
9438     // Remove the landing pad successor from the invoke block and replace it
9439     // with the new dispatch block.
9440     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
9441                                                   BB->succ_end());
9442     while (!Successors.empty()) {
9443       MachineBasicBlock *SMBB = Successors.pop_back_val();
9444       if (SMBB->isEHPad()) {
9445         BB->removeSuccessor(SMBB);
9446         MBBLPads.push_back(SMBB);
9447       }
9448     }
9449
9450     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
9451     BB->normalizeSuccProbs();
9452
9453     // Find the invoke call and mark all of the callee-saved registers as
9454     // 'implicit defined' so that they're spilled. This prevents code from
9455     // moving instructions to before the EH block, where they will never be
9456     // executed.
9457     for (MachineBasicBlock::reverse_iterator
9458            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
9459       if (!II->isCall()) continue;
9460
9461       DenseMap<unsigned, bool> DefRegs;
9462       for (MachineInstr::mop_iterator
9463              OI = II->operands_begin(), OE = II->operands_end();
9464            OI != OE; ++OI) {
9465         if (!OI->isReg()) continue;
9466         DefRegs[OI->getReg()] = true;
9467       }
9468
9469       MachineInstrBuilder MIB(*MF, &*II);
9470
9471       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
9472         unsigned Reg = SavedRegs[i];
9473         if (Subtarget->isThumb2() &&
9474             !ARM::tGPRRegClass.contains(Reg) &&
9475             !ARM::hGPRRegClass.contains(Reg))
9476           continue;
9477         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
9478           continue;
9479         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
9480           continue;
9481         if (!DefRegs[Reg])
9482           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
9483       }
9484
9485       break;
9486     }
9487   }
9488
9489   // Mark all former landing pads as non-landing pads. The dispatch is the only
9490   // landing pad now.
9491   for (SmallVectorImpl<MachineBasicBlock*>::iterator
9492          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
9493     (*I)->setIsEHPad(false);
9494
9495   // The instruction is gone now.
9496   MI.eraseFromParent();
9497 }
9498
9499 static
9500 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
9501   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
9502        E = MBB->succ_end(); I != E; ++I)
9503     if (*I != Succ)
9504       return *I;
9505   llvm_unreachable("Expecting a BB with two successors!");
9506 }
9507
9508 /// Return the load opcode for a given load size. If load size >= 8,
9509 /// neon opcode will be returned.
9510 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
9511   if (LdSize >= 8)
9512     return LdSize == 16 ? ARM::VLD1q32wb_fixed
9513                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
9514   if (IsThumb1)
9515     return LdSize == 4 ? ARM::tLDRi
9516                        : LdSize == 2 ? ARM::tLDRHi
9517                                      : LdSize == 1 ? ARM::tLDRBi : 0;
9518   if (IsThumb2)
9519     return LdSize == 4 ? ARM::t2LDR_POST
9520                        : LdSize == 2 ? ARM::t2LDRH_POST
9521                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
9522   return LdSize == 4 ? ARM::LDR_POST_IMM
9523                      : LdSize == 2 ? ARM::LDRH_POST
9524                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
9525 }
9526
9527 /// Return the store opcode for a given store size. If store size >= 8,
9528 /// neon opcode will be returned.
9529 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
9530   if (StSize >= 8)
9531     return StSize == 16 ? ARM::VST1q32wb_fixed
9532                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
9533   if (IsThumb1)
9534     return StSize == 4 ? ARM::tSTRi
9535                        : StSize == 2 ? ARM::tSTRHi
9536                                      : StSize == 1 ? ARM::tSTRBi : 0;
9537   if (IsThumb2)
9538     return StSize == 4 ? ARM::t2STR_POST
9539                        : StSize == 2 ? ARM::t2STRH_POST
9540                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
9541   return StSize == 4 ? ARM::STR_POST_IMM
9542                      : StSize == 2 ? ARM::STRH_POST
9543                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
9544 }
9545
9546 /// Emit a post-increment load operation with given size. The instructions
9547 /// will be added to BB at Pos.
9548 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
9549                        const TargetInstrInfo *TII, const DebugLoc &dl,
9550                        unsigned LdSize, unsigned Data, unsigned AddrIn,
9551                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
9552   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
9553   assert(LdOpc != 0 && "Should have a load opcode");
9554   if (LdSize >= 8) {
9555     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9556         .addReg(AddrOut, RegState::Define)
9557         .addReg(AddrIn)
9558         .addImm(0)
9559         .add(predOps(ARMCC::AL));
9560   } else if (IsThumb1) {
9561     // load + update AddrIn
9562     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9563         .addReg(AddrIn)
9564         .addImm(0)
9565         .add(predOps(ARMCC::AL));
9566     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
9567         .add(t1CondCodeOp())
9568         .addReg(AddrIn)
9569         .addImm(LdSize)
9570         .add(predOps(ARMCC::AL));
9571   } else if (IsThumb2) {
9572     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9573         .addReg(AddrOut, RegState::Define)
9574         .addReg(AddrIn)
9575         .addImm(LdSize)
9576         .add(predOps(ARMCC::AL));
9577   } else { // arm
9578     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9579         .addReg(AddrOut, RegState::Define)
9580         .addReg(AddrIn)
9581         .addReg(0)
9582         .addImm(LdSize)
9583         .add(predOps(ARMCC::AL));
9584   }
9585 }
9586
9587 /// Emit a post-increment store operation with given size. The instructions
9588 /// will be added to BB at Pos.
9589 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
9590                        const TargetInstrInfo *TII, const DebugLoc &dl,
9591                        unsigned StSize, unsigned Data, unsigned AddrIn,
9592                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
9593   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
9594   assert(StOpc != 0 && "Should have a store opcode");
9595   if (StSize >= 8) {
9596     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
9597         .addReg(AddrIn)
9598         .addImm(0)
9599         .addReg(Data)
9600         .add(predOps(ARMCC::AL));
9601   } else if (IsThumb1) {
9602     // store + update AddrIn
9603     BuildMI(*BB, Pos, dl, TII->get(StOpc))
9604         .addReg(Data)
9605         .addReg(AddrIn)
9606         .addImm(0)
9607         .add(predOps(ARMCC::AL));
9608     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
9609         .add(t1CondCodeOp())
9610         .addReg(AddrIn)
9611         .addImm(StSize)
9612         .add(predOps(ARMCC::AL));
9613   } else if (IsThumb2) {
9614     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
9615         .addReg(Data)
9616         .addReg(AddrIn)
9617         .addImm(StSize)
9618         .add(predOps(ARMCC::AL));
9619   } else { // arm
9620     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
9621         .addReg(Data)
9622         .addReg(AddrIn)
9623         .addReg(0)
9624         .addImm(StSize)
9625         .add(predOps(ARMCC::AL));
9626   }
9627 }
9628
9629 MachineBasicBlock *
9630 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
9631                                    MachineBasicBlock *BB) const {
9632   // This pseudo instruction has 3 operands: dst, src, size
9633   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
9634   // Otherwise, we will generate unrolled scalar copies.
9635   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
9636   const BasicBlock *LLVM_BB = BB->getBasicBlock();
9637   MachineFunction::iterator It = ++BB->getIterator();
9638
9639   unsigned dest = MI.getOperand(0).getReg();
9640   unsigned src = MI.getOperand(1).getReg();
9641   unsigned SizeVal = MI.getOperand(2).getImm();
9642   unsigned Align = MI.getOperand(3).getImm();
9643   DebugLoc dl = MI.getDebugLoc();
9644
9645   MachineFunction *MF = BB->getParent();
9646   MachineRegisterInfo &MRI = MF->getRegInfo();
9647   unsigned UnitSize = 0;
9648   const TargetRegisterClass *TRC = nullptr;
9649   const TargetRegisterClass *VecTRC = nullptr;
9650
9651   bool IsThumb1 = Subtarget->isThumb1Only();
9652   bool IsThumb2 = Subtarget->isThumb2();
9653   bool IsThumb = Subtarget->isThumb();
9654
9655   if (Align & 1) {
9656     UnitSize = 1;
9657   } else if (Align & 2) {
9658     UnitSize = 2;
9659   } else {
9660     // Check whether we can use NEON instructions.
9661     if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
9662         Subtarget->hasNEON()) {
9663       if ((Align % 16 == 0) && SizeVal >= 16)
9664         UnitSize = 16;
9665       else if ((Align % 8 == 0) && SizeVal >= 8)
9666         UnitSize = 8;
9667     }
9668     // Can't use NEON instructions.
9669     if (UnitSize == 0)
9670       UnitSize = 4;
9671   }
9672
9673   // Select the correct opcode and register class for unit size load/store
9674   bool IsNeon = UnitSize >= 8;
9675   TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
9676   if (IsNeon)
9677     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
9678                             : UnitSize == 8 ? &ARM::DPRRegClass
9679                                             : nullptr;
9680
9681   unsigned BytesLeft = SizeVal % UnitSize;
9682   unsigned LoopSize = SizeVal - BytesLeft;
9683
9684   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
9685     // Use LDR and STR to copy.
9686     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
9687     // [destOut] = STR_POST(scratch, destIn, UnitSize)
9688     unsigned srcIn = src;
9689     unsigned destIn = dest;
9690     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
9691       unsigned srcOut = MRI.createVirtualRegister(TRC);
9692       unsigned destOut = MRI.createVirtualRegister(TRC);
9693       unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
9694       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
9695                  IsThumb1, IsThumb2);
9696       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
9697                  IsThumb1, IsThumb2);
9698       srcIn = srcOut;
9699       destIn = destOut;
9700     }
9701
9702     // Handle the leftover bytes with LDRB and STRB.
9703     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
9704     // [destOut] = STRB_POST(scratch, destIn, 1)
9705     for (unsigned i = 0; i < BytesLeft; i++) {
9706       unsigned srcOut = MRI.createVirtualRegister(TRC);
9707       unsigned destOut = MRI.createVirtualRegister(TRC);
9708       unsigned scratch = MRI.createVirtualRegister(TRC);
9709       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
9710                  IsThumb1, IsThumb2);
9711       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
9712                  IsThumb1, IsThumb2);
9713       srcIn = srcOut;
9714       destIn = destOut;
9715     }
9716     MI.eraseFromParent(); // The instruction is gone now.
9717     return BB;
9718   }
9719
9720   // Expand the pseudo op to a loop.
9721   // thisMBB:
9722   //   ...
9723   //   movw varEnd, # --> with thumb2
9724   //   movt varEnd, #
9725   //   ldrcp varEnd, idx --> without thumb2
9726   //   fallthrough --> loopMBB
9727   // loopMBB:
9728   //   PHI varPhi, varEnd, varLoop
9729   //   PHI srcPhi, src, srcLoop
9730   //   PHI destPhi, dst, destLoop
9731   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
9732   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
9733   //   subs varLoop, varPhi, #UnitSize
9734   //   bne loopMBB
9735   //   fallthrough --> exitMBB
9736   // exitMBB:
9737   //   epilogue to handle left-over bytes
9738   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
9739   //   [destOut] = STRB_POST(scratch, destLoop, 1)
9740   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
9741   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
9742   MF->insert(It, loopMBB);
9743   MF->insert(It, exitMBB);
9744
9745   // Transfer the remainder of BB and its successor edges to exitMBB.
9746   exitMBB->splice(exitMBB->begin(), BB,
9747                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
9748   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
9749
9750   // Load an immediate to varEnd.
9751   unsigned varEnd = MRI.createVirtualRegister(TRC);
9752   if (Subtarget->useMovt()) {
9753     unsigned Vtmp = varEnd;
9754     if ((LoopSize & 0xFFFF0000) != 0)
9755       Vtmp = MRI.createVirtualRegister(TRC);
9756     BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
9757         .addImm(LoopSize & 0xFFFF)
9758         .add(predOps(ARMCC::AL));
9759
9760     if ((LoopSize & 0xFFFF0000) != 0)
9761       BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
9762           .addReg(Vtmp)
9763           .addImm(LoopSize >> 16)
9764           .add(predOps(ARMCC::AL));
9765   } else {
9766     MachineConstantPool *ConstantPool = MF->getConstantPool();
9767     Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
9768     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
9769
9770     // MachineConstantPool wants an explicit alignment.
9771     unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
9772     if (Align == 0)
9773       Align = MF->getDataLayout().getTypeAllocSize(C->getType());
9774     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
9775     MachineMemOperand *CPMMO =
9776         MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
9777                                  MachineMemOperand::MOLoad, 4, 4);
9778
9779     if (IsThumb)
9780       BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
9781           .addReg(varEnd, RegState::Define)
9782           .addConstantPoolIndex(Idx)
9783           .add(predOps(ARMCC::AL))
9784           .addMemOperand(CPMMO);
9785     else
9786       BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
9787           .addReg(varEnd, RegState::Define)
9788           .addConstantPoolIndex(Idx)
9789           .addImm(0)
9790           .add(predOps(ARMCC::AL))
9791           .addMemOperand(CPMMO);
9792   }
9793   BB->addSuccessor(loopMBB);
9794
9795   // Generate the loop body:
9796   //   varPhi = PHI(varLoop, varEnd)
9797   //   srcPhi = PHI(srcLoop, src)
9798   //   destPhi = PHI(destLoop, dst)
9799   MachineBasicBlock *entryBB = BB;
9800   BB = loopMBB;
9801   unsigned varLoop = MRI.createVirtualRegister(TRC);
9802   unsigned varPhi = MRI.createVirtualRegister(TRC);
9803   unsigned srcLoop = MRI.createVirtualRegister(TRC);
9804   unsigned srcPhi = MRI.createVirtualRegister(TRC);
9805   unsigned destLoop = MRI.createVirtualRegister(TRC);
9806   unsigned destPhi = MRI.createVirtualRegister(TRC);
9807
9808   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
9809     .addReg(varLoop).addMBB(loopMBB)
9810     .addReg(varEnd).addMBB(entryBB);
9811   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
9812     .addReg(srcLoop).addMBB(loopMBB)
9813     .addReg(src).addMBB(entryBB);
9814   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
9815     .addReg(destLoop).addMBB(loopMBB)
9816     .addReg(dest).addMBB(entryBB);
9817
9818   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
9819   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
9820   unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
9821   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
9822              IsThumb1, IsThumb2);
9823   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
9824              IsThumb1, IsThumb2);
9825
9826   // Decrement loop variable by UnitSize.
9827   if (IsThumb1) {
9828     BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
9829         .add(t1CondCodeOp())
9830         .addReg(varPhi)
9831         .addImm(UnitSize)
9832         .add(predOps(ARMCC::AL));
9833   } else {
9834     MachineInstrBuilder MIB =
9835         BuildMI(*BB, BB->end(), dl,
9836                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
9837     MIB.addReg(varPhi)
9838         .addImm(UnitSize)
9839         .add(predOps(ARMCC::AL))
9840         .add(condCodeOp());
9841     MIB->getOperand(5).setReg(ARM::CPSR);
9842     MIB->getOperand(5).setIsDef(true);
9843   }
9844   BuildMI(*BB, BB->end(), dl,
9845           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
9846       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
9847
9848   // loopMBB can loop back to loopMBB or fall through to exitMBB.
9849   BB->addSuccessor(loopMBB);
9850   BB->addSuccessor(exitMBB);
9851
9852   // Add epilogue to handle BytesLeft.
9853   BB = exitMBB;
9854   auto StartOfExit = exitMBB->begin();
9855
9856   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
9857   //   [destOut] = STRB_POST(scratch, destLoop, 1)
9858   unsigned srcIn = srcLoop;
9859   unsigned destIn = destLoop;
9860   for (unsigned i = 0; i < BytesLeft; i++) {
9861     unsigned srcOut = MRI.createVirtualRegister(TRC);
9862     unsigned destOut = MRI.createVirtualRegister(TRC);
9863     unsigned scratch = MRI.createVirtualRegister(TRC);
9864     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
9865                IsThumb1, IsThumb2);
9866     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
9867                IsThumb1, IsThumb2);
9868     srcIn = srcOut;
9869     destIn = destOut;
9870   }
9871
9872   MI.eraseFromParent(); // The instruction is gone now.
9873   return BB;
9874 }
9875
9876 MachineBasicBlock *
9877 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
9878                                        MachineBasicBlock *MBB) const {
9879   const TargetMachine &TM = getTargetMachine();
9880   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
9881   DebugLoc DL = MI.getDebugLoc();
9882
9883   assert(Subtarget->isTargetWindows() &&
9884          "__chkstk is only supported on Windows");
9885   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
9886
9887   // __chkstk takes the number of words to allocate on the stack in R4, and
9888   // returns the stack adjustment in number of bytes in R4.  This will not
9889   // clober any other registers (other than the obvious lr).
9890   //
9891   // Although, technically, IP should be considered a register which may be
9892   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
9893   // thumb-2 environment, so there is no interworking required.  As a result, we
9894   // do not expect a veneer to be emitted by the linker, clobbering IP.
9895   //
9896   // Each module receives its own copy of __chkstk, so no import thunk is
9897   // required, again, ensuring that IP is not clobbered.
9898   //
9899   // Finally, although some linkers may theoretically provide a trampoline for
9900   // out of range calls (which is quite common due to a 32M range limitation of
9901   // branches for Thumb), we can generate the long-call version via
9902   // -mcmodel=large, alleviating the need for the trampoline which may clobber
9903   // IP.
9904
9905   switch (TM.getCodeModel()) {
9906   case CodeModel::Tiny:
9907     llvm_unreachable("Tiny code model not available on ARM.");
9908   case CodeModel::Small:
9909   case CodeModel::Medium:
9910   case CodeModel::Kernel:
9911     BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
9912         .add(predOps(ARMCC::AL))
9913         .addExternalSymbol("__chkstk")
9914         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
9915         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
9916         .addReg(ARM::R12,
9917                 RegState::Implicit | RegState::Define | RegState::Dead)
9918         .addReg(ARM::CPSR,
9919                 RegState::Implicit | RegState::Define | RegState::Dead);
9920     break;
9921   case CodeModel::Large: {
9922     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
9923     unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
9924
9925     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
9926       .addExternalSymbol("__chkstk");
9927     BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
9928         .add(predOps(ARMCC::AL))
9929         .addReg(Reg, RegState::Kill)
9930         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
9931         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
9932         .addReg(ARM::R12,
9933                 RegState::Implicit | RegState::Define | RegState::Dead)
9934         .addReg(ARM::CPSR,
9935                 RegState::Implicit | RegState::Define | RegState::Dead);
9936     break;
9937   }
9938   }
9939
9940   BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
9941       .addReg(ARM::SP, RegState::Kill)
9942       .addReg(ARM::R4, RegState::Kill)
9943       .setMIFlags(MachineInstr::FrameSetup)
9944       .add(predOps(ARMCC::AL))
9945       .add(condCodeOp());
9946
9947   MI.eraseFromParent();
9948   return MBB;
9949 }
9950
9951 MachineBasicBlock *
9952 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
9953                                        MachineBasicBlock *MBB) const {
9954   DebugLoc DL = MI.getDebugLoc();
9955   MachineFunction *MF = MBB->getParent();
9956   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
9957
9958   MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
9959   MF->insert(++MBB->getIterator(), ContBB);
9960   ContBB->splice(ContBB->begin(), MBB,
9961                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
9962   ContBB->transferSuccessorsAndUpdatePHIs(MBB);
9963   MBB->addSuccessor(ContBB);
9964
9965   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
9966   BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
9967   MF->push_back(TrapBB);
9968   MBB->addSuccessor(TrapBB);
9969
9970   BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
9971       .addReg(MI.getOperand(0).getReg())
9972       .addImm(0)
9973       .add(predOps(ARMCC::AL));
9974   BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
9975       .addMBB(TrapBB)
9976       .addImm(ARMCC::EQ)
9977       .addReg(ARM::CPSR);
9978
9979   MI.eraseFromParent();
9980   return ContBB;
9981 }
9982
9983 // The CPSR operand of SelectItr might be missing a kill marker
9984 // because there were multiple uses of CPSR, and ISel didn't know
9985 // which to mark. Figure out whether SelectItr should have had a
9986 // kill marker, and set it if it should. Returns the correct kill
9987 // marker value.
9988 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
9989                                    MachineBasicBlock* BB,
9990                                    const TargetRegisterInfo* TRI) {
9991   // Scan forward through BB for a use/def of CPSR.
9992   MachineBasicBlock::iterator miI(std::next(SelectItr));
9993   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
9994     const MachineInstr& mi = *miI;
9995     if (mi.readsRegister(ARM::CPSR))
9996       return false;
9997     if (mi.definesRegister(ARM::CPSR))
9998       break; // Should have kill-flag - update below.
9999   }
10000
10001   // If we hit the end of the block, check whether CPSR is live into a
10002   // successor.
10003   if (miI == BB->end()) {
10004     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
10005                                           sEnd = BB->succ_end();
10006          sItr != sEnd; ++sItr) {
10007       MachineBasicBlock* succ = *sItr;
10008       if (succ->isLiveIn(ARM::CPSR))
10009         return false;
10010     }
10011   }
10012
10013   // We found a def, or hit the end of the basic block and CPSR wasn't live
10014   // out. SelectMI should have a kill flag on CPSR.
10015   SelectItr->addRegisterKilled(ARM::CPSR, TRI);
10016   return true;
10017 }
10018
10019 MachineBasicBlock *
10020 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
10021                                                MachineBasicBlock *BB) const {
10022   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10023   DebugLoc dl = MI.getDebugLoc();
10024   bool isThumb2 = Subtarget->isThumb2();
10025   switch (MI.getOpcode()) {
10026   default: {
10027     MI.print(errs());
10028     llvm_unreachable("Unexpected instr type to insert");
10029   }
10030
10031   // Thumb1 post-indexed loads are really just single-register LDMs.
10032   case ARM::tLDR_postidx: {
10033     MachineOperand Def(MI.getOperand(1));
10034     BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
10035         .add(Def)  // Rn_wb
10036         .add(MI.getOperand(2))  // Rn
10037         .add(MI.getOperand(3))  // PredImm
10038         .add(MI.getOperand(4))  // PredReg
10039         .add(MI.getOperand(0))  // Rt
10040         .cloneMemRefs(MI);
10041     MI.eraseFromParent();
10042     return BB;
10043   }
10044
10045   // The Thumb2 pre-indexed stores have the same MI operands, they just
10046   // define them differently in the .td files from the isel patterns, so
10047   // they need pseudos.
10048   case ARM::t2STR_preidx:
10049     MI.setDesc(TII->get(ARM::t2STR_PRE));
10050     return BB;
10051   case ARM::t2STRB_preidx:
10052     MI.setDesc(TII->get(ARM::t2STRB_PRE));
10053     return BB;
10054   case ARM::t2STRH_preidx:
10055     MI.setDesc(TII->get(ARM::t2STRH_PRE));
10056     return BB;
10057
10058   case ARM::STRi_preidx:
10059   case ARM::STRBi_preidx: {
10060     unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
10061                                                          : ARM::STRB_PRE_IMM;
10062     // Decode the offset.
10063     unsigned Offset = MI.getOperand(4).getImm();
10064     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
10065     Offset = ARM_AM::getAM2Offset(Offset);
10066     if (isSub)
10067       Offset = -Offset;
10068
10069     MachineMemOperand *MMO = *MI.memoperands_begin();
10070     BuildMI(*BB, MI, dl, TII->get(NewOpc))
10071         .add(MI.getOperand(0)) // Rn_wb
10072         .add(MI.getOperand(1)) // Rt
10073         .add(MI.getOperand(2)) // Rn
10074         .addImm(Offset)        // offset (skip GPR==zero_reg)
10075         .add(MI.getOperand(5)) // pred
10076         .add(MI.getOperand(6))
10077         .addMemOperand(MMO);
10078     MI.eraseFromParent();
10079     return BB;
10080   }
10081   case ARM::STRr_preidx:
10082   case ARM::STRBr_preidx:
10083   case ARM::STRH_preidx: {
10084     unsigned NewOpc;
10085     switch (MI.getOpcode()) {
10086     default: llvm_unreachable("unexpected opcode!");
10087     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
10088     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
10089     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
10090     }
10091     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
10092     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
10093       MIB.add(MI.getOperand(i));
10094     MI.eraseFromParent();
10095     return BB;
10096   }
10097
10098   case ARM::tMOVCCr_pseudo: {
10099     // To "insert" a SELECT_CC instruction, we actually have to insert the
10100     // diamond control-flow pattern.  The incoming instruction knows the
10101     // destination vreg to set, the condition code register to branch on, the
10102     // true/false values to select between, and a branch opcode to use.
10103     const BasicBlock *LLVM_BB = BB->getBasicBlock();
10104     MachineFunction::iterator It = ++BB->getIterator();
10105
10106     //  thisMBB:
10107     //  ...
10108     //   TrueVal = ...
10109     //   cmpTY ccX, r1, r2
10110     //   bCC copy1MBB
10111     //   fallthrough --> copy0MBB
10112     MachineBasicBlock *thisMBB  = BB;
10113     MachineFunction *F = BB->getParent();
10114     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
10115     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
10116     F->insert(It, copy0MBB);
10117     F->insert(It, sinkMBB);
10118
10119     // Check whether CPSR is live past the tMOVCCr_pseudo.
10120     const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
10121     if (!MI.killsRegister(ARM::CPSR) &&
10122         !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
10123       copy0MBB->addLiveIn(ARM::CPSR);
10124       sinkMBB->addLiveIn(ARM::CPSR);
10125     }
10126
10127     // Transfer the remainder of BB and its successor edges to sinkMBB.
10128     sinkMBB->splice(sinkMBB->begin(), BB,
10129                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
10130     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
10131
10132     BB->addSuccessor(copy0MBB);
10133     BB->addSuccessor(sinkMBB);
10134
10135     BuildMI(BB, dl, TII->get(ARM::tBcc))
10136         .addMBB(sinkMBB)
10137         .addImm(MI.getOperand(3).getImm())
10138         .addReg(MI.getOperand(4).getReg());
10139
10140     //  copy0MBB:
10141     //   %FalseValue = ...
10142     //   # fallthrough to sinkMBB
10143     BB = copy0MBB;
10144
10145     // Update machine-CFG edges
10146     BB->addSuccessor(sinkMBB);
10147
10148     //  sinkMBB:
10149     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
10150     //  ...
10151     BB = sinkMBB;
10152     BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
10153         .addReg(MI.getOperand(1).getReg())
10154         .addMBB(copy0MBB)
10155         .addReg(MI.getOperand(2).getReg())
10156         .addMBB(thisMBB);
10157
10158     MI.eraseFromParent(); // The pseudo instruction is gone now.
10159     return BB;
10160   }
10161
10162   case ARM::BCCi64:
10163   case ARM::BCCZi64: {
10164     // If there is an unconditional branch to the other successor, remove it.
10165     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
10166
10167     // Compare both parts that make up the double comparison separately for
10168     // equality.
10169     bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
10170
10171     unsigned LHS1 = MI.getOperand(1).getReg();
10172     unsigned LHS2 = MI.getOperand(2).getReg();
10173     if (RHSisZero) {
10174       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
10175           .addReg(LHS1)
10176           .addImm(0)
10177           .add(predOps(ARMCC::AL));
10178       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
10179         .addReg(LHS2).addImm(0)
10180         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
10181     } else {
10182       unsigned RHS1 = MI.getOperand(3).getReg();
10183       unsigned RHS2 = MI.getOperand(4).getReg();
10184       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
10185           .addReg(LHS1)
10186           .addReg(RHS1)
10187           .add(predOps(ARMCC::AL));
10188       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
10189         .addReg(LHS2).addReg(RHS2)
10190         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
10191     }
10192
10193     MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
10194     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
10195     if (MI.getOperand(0).getImm() == ARMCC::NE)
10196       std::swap(destMBB, exitMBB);
10197
10198     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
10199       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
10200     if (isThumb2)
10201       BuildMI(BB, dl, TII->get(ARM::t2B))
10202           .addMBB(exitMBB)
10203           .add(predOps(ARMCC::AL));
10204     else
10205       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
10206
10207     MI.eraseFromParent(); // The pseudo instruction is gone now.
10208     return BB;
10209   }
10210
10211   case ARM::Int_eh_sjlj_setjmp:
10212   case ARM::Int_eh_sjlj_setjmp_nofp:
10213   case ARM::tInt_eh_sjlj_setjmp:
10214   case ARM::t2Int_eh_sjlj_setjmp:
10215   case ARM::t2Int_eh_sjlj_setjmp_nofp:
10216     return BB;
10217
10218   case ARM::Int_eh_sjlj_setup_dispatch:
10219     EmitSjLjDispatchBlock(MI, BB);
10220     return BB;
10221
10222   case ARM::ABS:
10223   case ARM::t2ABS: {
10224     // To insert an ABS instruction, we have to insert the
10225     // diamond control-flow pattern.  The incoming instruction knows the
10226     // source vreg to test against 0, the destination vreg to set,
10227     // the condition code register to branch on, the
10228     // true/false values to select between, and a branch opcode to use.
10229     // It transforms
10230     //     V1 = ABS V0
10231     // into
10232     //     V2 = MOVS V0
10233     //     BCC                      (branch to SinkBB if V0 >= 0)
10234     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
10235     //     SinkBB: V1 = PHI(V2, V3)
10236     const BasicBlock *LLVM_BB = BB->getBasicBlock();
10237     MachineFunction::iterator BBI = ++BB->getIterator();
10238     MachineFunction *Fn = BB->getParent();
10239     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
10240     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
10241     Fn->insert(BBI, RSBBB);
10242     Fn->insert(BBI, SinkBB);
10243
10244     unsigned int ABSSrcReg = MI.getOperand(1).getReg();
10245     unsigned int ABSDstReg = MI.getOperand(0).getReg();
10246     bool ABSSrcKIll = MI.getOperand(1).isKill();
10247     bool isThumb2 = Subtarget->isThumb2();
10248     MachineRegisterInfo &MRI = Fn->getRegInfo();
10249     // In Thumb mode S must not be specified if source register is the SP or
10250     // PC and if destination register is the SP, so restrict register class
10251     unsigned NewRsbDstReg =
10252       MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
10253
10254     // Transfer the remainder of BB and its successor edges to sinkMBB.
10255     SinkBB->splice(SinkBB->begin(), BB,
10256                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
10257     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
10258
10259     BB->addSuccessor(RSBBB);
10260     BB->addSuccessor(SinkBB);
10261
10262     // fall through to SinkMBB
10263     RSBBB->addSuccessor(SinkBB);
10264
10265     // insert a cmp at the end of BB
10266     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
10267         .addReg(ABSSrcReg)
10268         .addImm(0)
10269         .add(predOps(ARMCC::AL));
10270
10271     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
10272     BuildMI(BB, dl,
10273       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
10274       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
10275
10276     // insert rsbri in RSBBB
10277     // Note: BCC and rsbri will be converted into predicated rsbmi
10278     // by if-conversion pass
10279     BuildMI(*RSBBB, RSBBB->begin(), dl,
10280             TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
10281         .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
10282         .addImm(0)
10283         .add(predOps(ARMCC::AL))
10284         .add(condCodeOp());
10285
10286     // insert PHI in SinkBB,
10287     // reuse ABSDstReg to not change uses of ABS instruction
10288     BuildMI(*SinkBB, SinkBB->begin(), dl,
10289       TII->get(ARM::PHI), ABSDstReg)
10290       .addReg(NewRsbDstReg).addMBB(RSBBB)
10291       .addReg(ABSSrcReg).addMBB(BB);
10292
10293     // remove ABS instruction
10294     MI.eraseFromParent();
10295
10296     // return last added BB
10297     return SinkBB;
10298   }
10299   case ARM::COPY_STRUCT_BYVAL_I32:
10300     ++NumLoopByVals;
10301     return EmitStructByval(MI, BB);
10302   case ARM::WIN__CHKSTK:
10303     return EmitLowered__chkstk(MI, BB);
10304   case ARM::WIN__DBZCHK:
10305     return EmitLowered__dbzchk(MI, BB);
10306   }
10307 }
10308
10309 /// Attaches vregs to MEMCPY that it will use as scratch registers
10310 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
10311 /// instead of as a custom inserter because we need the use list from the SDNode.
10312 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
10313                                     MachineInstr &MI, const SDNode *Node) {
10314   bool isThumb1 = Subtarget->isThumb1Only();
10315
10316   DebugLoc DL = MI.getDebugLoc();
10317   MachineFunction *MF = MI.getParent()->getParent();
10318   MachineRegisterInfo &MRI = MF->getRegInfo();
10319   MachineInstrBuilder MIB(*MF, MI);
10320
10321   // If the new dst/src is unused mark it as dead.
10322   if (!Node->hasAnyUseOfValue(0)) {
10323     MI.getOperand(0).setIsDead(true);
10324   }
10325   if (!Node->hasAnyUseOfValue(1)) {
10326     MI.getOperand(1).setIsDead(true);
10327   }
10328
10329   // The MEMCPY both defines and kills the scratch registers.
10330   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
10331     unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
10332                                                          : &ARM::GPRRegClass);
10333     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
10334   }
10335 }
10336
10337 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
10338                                                       SDNode *Node) const {
10339   if (MI.getOpcode() == ARM::MEMCPY) {
10340     attachMEMCPYScratchRegs(Subtarget, MI, Node);
10341     return;
10342   }
10343
10344   const MCInstrDesc *MCID = &MI.getDesc();
10345   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
10346   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
10347   // operand is still set to noreg. If needed, set the optional operand's
10348   // register to CPSR, and remove the redundant implicit def.
10349   //
10350   // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
10351
10352   // Rename pseudo opcodes.
10353   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
10354   unsigned ccOutIdx;
10355   if (NewOpc) {
10356     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
10357     MCID = &TII->get(NewOpc);
10358
10359     assert(MCID->getNumOperands() ==
10360            MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
10361         && "converted opcode should be the same except for cc_out"
10362            " (and, on Thumb1, pred)");
10363
10364     MI.setDesc(*MCID);
10365
10366     // Add the optional cc_out operand
10367     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
10368
10369     // On Thumb1, move all input operands to the end, then add the predicate
10370     if (Subtarget->isThumb1Only()) {
10371       for (unsigned c = MCID->getNumOperands() - 4; c--;) {
10372         MI.addOperand(MI.getOperand(1));
10373         MI.RemoveOperand(1);
10374       }
10375
10376       // Restore the ties
10377       for (unsigned i = MI.getNumOperands(); i--;) {
10378         const MachineOperand& op = MI.getOperand(i);
10379         if (op.isReg() && op.isUse()) {
10380           int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
10381           if (DefIdx != -1)
10382             MI.tieOperands(DefIdx, i);
10383         }
10384       }
10385
10386       MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
10387       MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
10388       ccOutIdx = 1;
10389     } else
10390       ccOutIdx = MCID->getNumOperands() - 1;
10391   } else
10392     ccOutIdx = MCID->getNumOperands() - 1;
10393
10394   // Any ARM instruction that sets the 's' bit should specify an optional
10395   // "cc_out" operand in the last operand position.
10396   if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
10397     assert(!NewOpc && "Optional cc_out operand required");
10398     return;
10399   }
10400   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
10401   // since we already have an optional CPSR def.
10402   bool definesCPSR = false;
10403   bool deadCPSR = false;
10404   for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
10405        ++i) {
10406     const MachineOperand &MO = MI.getOperand(i);
10407     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
10408       definesCPSR = true;
10409       if (MO.isDead())
10410         deadCPSR = true;
10411       MI.RemoveOperand(i);
10412       break;
10413     }
10414   }
10415   if (!definesCPSR) {
10416     assert(!NewOpc && "Optional cc_out operand required");
10417     return;
10418   }
10419   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
10420   if (deadCPSR) {
10421     assert(!MI.getOperand(ccOutIdx).getReg() &&
10422            "expect uninitialized optional cc_out operand");
10423     // Thumb1 instructions must have the S bit even if the CPSR is dead.
10424     if (!Subtarget->isThumb1Only())
10425       return;
10426   }
10427
10428   // If this instruction was defined with an optional CPSR def and its dag node
10429   // had a live implicit CPSR def, then activate the optional CPSR def.
10430   MachineOperand &MO = MI.getOperand(ccOutIdx);
10431   MO.setReg(ARM::CPSR);
10432   MO.setIsDef(true);
10433 }
10434
10435 //===----------------------------------------------------------------------===//
10436 //                           ARM Optimization Hooks
10437 //===----------------------------------------------------------------------===//
10438
10439 // Helper function that checks if N is a null or all ones constant.
10440 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
10441   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
10442 }
10443
10444 // Return true if N is conditionally 0 or all ones.
10445 // Detects these expressions where cc is an i1 value:
10446 //
10447 //   (select cc 0, y)   [AllOnes=0]
10448 //   (select cc y, 0)   [AllOnes=0]
10449 //   (zext cc)          [AllOnes=0]
10450 //   (sext cc)          [AllOnes=0/1]
10451 //   (select cc -1, y)  [AllOnes=1]
10452 //   (select cc y, -1)  [AllOnes=1]
10453 //
10454 // Invert is set when N is the null/all ones constant when CC is false.
10455 // OtherOp is set to the alternative value of N.
10456 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
10457                                        SDValue &CC, bool &Invert,
10458                                        SDValue &OtherOp,
10459                                        SelectionDAG &DAG) {
10460   switch (N->getOpcode()) {
10461   default: return false;
10462   case ISD::SELECT: {
10463     CC = N->getOperand(0);
10464     SDValue N1 = N->getOperand(1);
10465     SDValue N2 = N->getOperand(2);
10466     if (isZeroOrAllOnes(N1, AllOnes)) {
10467       Invert = false;
10468       OtherOp = N2;
10469       return true;
10470     }
10471     if (isZeroOrAllOnes(N2, AllOnes)) {
10472       Invert = true;
10473       OtherOp = N1;
10474       return true;
10475     }
10476     return false;
10477   }
10478   case ISD::ZERO_EXTEND:
10479     // (zext cc) can never be the all ones value.
10480     if (AllOnes)
10481       return false;
10482     LLVM_FALLTHROUGH;
10483   case ISD::SIGN_EXTEND: {
10484     SDLoc dl(N);
10485     EVT VT = N->getValueType(0);
10486     CC = N->getOperand(0);
10487     if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
10488       return false;
10489     Invert = !AllOnes;
10490     if (AllOnes)
10491       // When looking for an AllOnes constant, N is an sext, and the 'other'
10492       // value is 0.
10493       OtherOp = DAG.getConstant(0, dl, VT);
10494     else if (N->getOpcode() == ISD::ZERO_EXTEND)
10495       // When looking for a 0 constant, N can be zext or sext.
10496       OtherOp = DAG.getConstant(1, dl, VT);
10497     else
10498       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
10499                                 VT);
10500     return true;
10501   }
10502   }
10503 }
10504
10505 // Combine a constant select operand into its use:
10506 //
10507 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
10508 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
10509 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
10510 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
10511 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
10512 //
10513 // The transform is rejected if the select doesn't have a constant operand that
10514 // is null, or all ones when AllOnes is set.
10515 //
10516 // Also recognize sext/zext from i1:
10517 //
10518 //   (add (zext cc), x) -> (select cc (add x, 1), x)
10519 //   (add (sext cc), x) -> (select cc (add x, -1), x)
10520 //
10521 // These transformations eventually create predicated instructions.
10522 //
10523 // @param N       The node to transform.
10524 // @param Slct    The N operand that is a select.
10525 // @param OtherOp The other N operand (x above).
10526 // @param DCI     Context.
10527 // @param AllOnes Require the select constant to be all ones instead of null.
10528 // @returns The new node, or SDValue() on failure.
10529 static
10530 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
10531                             TargetLowering::DAGCombinerInfo &DCI,
10532                             bool AllOnes = false) {
10533   SelectionDAG &DAG = DCI.DAG;
10534   EVT VT = N->getValueType(0);
10535   SDValue NonConstantVal;
10536   SDValue CCOp;
10537   bool SwapSelectOps;
10538   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
10539                                   NonConstantVal, DAG))
10540     return SDValue();
10541
10542   // Slct is now know to be the desired identity constant when CC is true.
10543   SDValue TrueVal = OtherOp;
10544   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
10545                                  OtherOp, NonConstantVal);
10546   // Unless SwapSelectOps says CC should be false.
10547   if (SwapSelectOps)
10548     std::swap(TrueVal, FalseVal);
10549
10550   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
10551                      CCOp, TrueVal, FalseVal);
10552 }
10553
10554 // Attempt combineSelectAndUse on each operand of a commutative operator N.
10555 static
10556 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
10557                                        TargetLowering::DAGCombinerInfo &DCI) {
10558   SDValue N0 = N->getOperand(0);
10559   SDValue N1 = N->getOperand(1);
10560   if (N0.getNode()->hasOneUse())
10561     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
10562       return Result;
10563   if (N1.getNode()->hasOneUse())
10564     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
10565       return Result;
10566   return SDValue();
10567 }
10568
10569 static bool IsVUZPShuffleNode(SDNode *N) {
10570   // VUZP shuffle node.
10571   if (N->getOpcode() == ARMISD::VUZP)
10572     return true;
10573
10574   // "VUZP" on i32 is an alias for VTRN.
10575   if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
10576     return true;
10577
10578   return false;
10579 }
10580
10581 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
10582                                  TargetLowering::DAGCombinerInfo &DCI,
10583                                  const ARMSubtarget *Subtarget) {
10584   // Look for ADD(VUZP.0, VUZP.1).
10585   if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
10586       N0 == N1)
10587    return SDValue();
10588
10589   // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
10590   if (!N->getValueType(0).is64BitVector())
10591     return SDValue();
10592
10593   // Generate vpadd.
10594   SelectionDAG &DAG = DCI.DAG;
10595   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10596   SDLoc dl(N);
10597   SDNode *Unzip = N0.getNode();
10598   EVT VT = N->getValueType(0);
10599
10600   SmallVector<SDValue, 8> Ops;
10601   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
10602                                 TLI.getPointerTy(DAG.getDataLayout())));
10603   Ops.push_back(Unzip->getOperand(0));
10604   Ops.push_back(Unzip->getOperand(1));
10605
10606   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
10607 }
10608
10609 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
10610                                       TargetLowering::DAGCombinerInfo &DCI,
10611                                       const ARMSubtarget *Subtarget) {
10612   // Check for two extended operands.
10613   if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
10614         N1.getOpcode() == ISD::SIGN_EXTEND) &&
10615       !(N0.getOpcode() == ISD::ZERO_EXTEND &&
10616         N1.getOpcode() == ISD::ZERO_EXTEND))
10617     return SDValue();
10618
10619   SDValue N00 = N0.getOperand(0);
10620   SDValue N10 = N1.getOperand(0);
10621
10622   // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
10623   if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
10624       N00 == N10)
10625     return SDValue();
10626
10627   // We only recognize Q register paddl here; this can't be reached until
10628   // after type legalization.
10629   if (!N00.getValueType().is64BitVector() ||
10630       !N0.getValueType().is128BitVector())
10631     return SDValue();
10632
10633   // Generate vpaddl.
10634   SelectionDAG &DAG = DCI.DAG;
10635   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10636   SDLoc dl(N);
10637   EVT VT = N->getValueType(0);
10638
10639   SmallVector<SDValue, 8> Ops;
10640   // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
10641   unsigned Opcode;
10642   if (N0.getOpcode() == ISD::SIGN_EXTEND)
10643     Opcode = Intrinsic::arm_neon_vpaddls;
10644   else
10645     Opcode = Intrinsic::arm_neon_vpaddlu;
10646   Ops.push_back(DAG.getConstant(Opcode, dl,
10647                                 TLI.getPointerTy(DAG.getDataLayout())));
10648   EVT ElemTy = N00.getValueType().getVectorElementType();
10649   unsigned NumElts = VT.getVectorNumElements();
10650   EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
10651   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
10652                                N00.getOperand(0), N00.getOperand(1));
10653   Ops.push_back(Concat);
10654
10655   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
10656 }
10657
10658 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
10659 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
10660 // much easier to match.
10661 static SDValue
10662 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
10663                                TargetLowering::DAGCombinerInfo &DCI,
10664                                const ARMSubtarget *Subtarget) {
10665   // Only perform optimization if after legalize, and if NEON is available. We
10666   // also expected both operands to be BUILD_VECTORs.
10667   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
10668       || N0.getOpcode() != ISD::BUILD_VECTOR
10669       || N1.getOpcode() != ISD::BUILD_VECTOR)
10670     return SDValue();
10671
10672   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
10673   EVT VT = N->getValueType(0);
10674   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
10675     return SDValue();
10676
10677   // Check that the vector operands are of the right form.
10678   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
10679   // operands, where N is the size of the formed vector.
10680   // Each EXTRACT_VECTOR should have the same input vector and odd or even
10681   // index such that we have a pair wise add pattern.
10682
10683   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
10684   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10685     return SDValue();
10686   SDValue Vec = N0->getOperand(0)->getOperand(0);
10687   SDNode *V = Vec.getNode();
10688   unsigned nextIndex = 0;
10689
10690   // For each operands to the ADD which are BUILD_VECTORs,
10691   // check to see if each of their operands are an EXTRACT_VECTOR with
10692   // the same vector and appropriate index.
10693   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
10694     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
10695         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
10696
10697       SDValue ExtVec0 = N0->getOperand(i);
10698       SDValue ExtVec1 = N1->getOperand(i);
10699
10700       // First operand is the vector, verify its the same.
10701       if (V != ExtVec0->getOperand(0).getNode() ||
10702           V != ExtVec1->getOperand(0).getNode())
10703         return SDValue();
10704
10705       // Second is the constant, verify its correct.
10706       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
10707       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
10708
10709       // For the constant, we want to see all the even or all the odd.
10710       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
10711           || C1->getZExtValue() != nextIndex+1)
10712         return SDValue();
10713
10714       // Increment index.
10715       nextIndex+=2;
10716     } else
10717       return SDValue();
10718   }
10719
10720   // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
10721   // we're using the entire input vector, otherwise there's a size/legality
10722   // mismatch somewhere.
10723   if (nextIndex != Vec.getValueType().getVectorNumElements() ||
10724       Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
10725     return SDValue();
10726
10727   // Create VPADDL node.
10728   SelectionDAG &DAG = DCI.DAG;
10729   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10730
10731   SDLoc dl(N);
10732
10733   // Build operand list.
10734   SmallVector<SDValue, 8> Ops;
10735   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
10736                                 TLI.getPointerTy(DAG.getDataLayout())));
10737
10738   // Input is the vector.
10739   Ops.push_back(Vec);
10740
10741   // Get widened type and narrowed type.
10742   MVT widenType;
10743   unsigned numElem = VT.getVectorNumElements();
10744
10745   EVT inputLaneType = Vec.getValueType().getVectorElementType();
10746   switch (inputLaneType.getSimpleVT().SimpleTy) {
10747     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
10748     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
10749     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
10750     default:
10751       llvm_unreachable("Invalid vector element type for padd optimization.");
10752   }
10753
10754   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
10755   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
10756   return DAG.getNode(ExtOp, dl, VT, tmp);
10757 }
10758
10759 static SDValue findMUL_LOHI(SDValue V) {
10760   if (V->getOpcode() == ISD::UMUL_LOHI ||
10761       V->getOpcode() == ISD::SMUL_LOHI)
10762     return V;
10763   return SDValue();
10764 }
10765
10766 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
10767                                         TargetLowering::DAGCombinerInfo &DCI,
10768                                         const ARMSubtarget *Subtarget) {
10769   if (Subtarget->isThumb()) {
10770     if (!Subtarget->hasDSP())
10771       return SDValue();
10772   } else if (!Subtarget->hasV5TEOps())
10773     return SDValue();
10774
10775   // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
10776   // accumulates the product into a 64-bit value. The 16-bit values will
10777   // be sign extended somehow or SRA'd into 32-bit values
10778   // (addc (adde (mul 16bit, 16bit), lo), hi)
10779   SDValue Mul = AddcNode->getOperand(0);
10780   SDValue Lo = AddcNode->getOperand(1);
10781   if (Mul.getOpcode() != ISD::MUL) {
10782     Lo = AddcNode->getOperand(0);
10783     Mul = AddcNode->getOperand(1);
10784     if (Mul.getOpcode() != ISD::MUL)
10785       return SDValue();
10786   }
10787
10788   SDValue SRA = AddeNode->getOperand(0);
10789   SDValue Hi = AddeNode->getOperand(1);
10790   if (SRA.getOpcode() != ISD::SRA) {
10791     SRA = AddeNode->getOperand(1);
10792     Hi = AddeNode->getOperand(0);
10793     if (SRA.getOpcode() != ISD::SRA)
10794       return SDValue();
10795   }
10796   if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
10797     if (Const->getZExtValue() != 31)
10798       return SDValue();
10799   } else
10800     return SDValue();
10801
10802   if (SRA.getOperand(0) != Mul)
10803     return SDValue();
10804
10805   SelectionDAG &DAG = DCI.DAG;
10806   SDLoc dl(AddcNode);
10807   unsigned Opcode = 0;
10808   SDValue Op0;
10809   SDValue Op1;
10810
10811   if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
10812     Opcode = ARMISD::SMLALBB;
10813     Op0 = Mul.getOperand(0);
10814     Op1 = Mul.getOperand(1);
10815   } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
10816     Opcode = ARMISD::SMLALBT;
10817     Op0 = Mul.getOperand(0);
10818     Op1 = Mul.getOperand(1).getOperand(0);
10819   } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
10820     Opcode = ARMISD::SMLALTB;
10821     Op0 = Mul.getOperand(0).getOperand(0);
10822     Op1 = Mul.getOperand(1);
10823   } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
10824     Opcode = ARMISD::SMLALTT;
10825     Op0 = Mul->getOperand(0).getOperand(0);
10826     Op1 = Mul->getOperand(1).getOperand(0);
10827   }
10828
10829   if (!Op0 || !Op1)
10830     return SDValue();
10831
10832   SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
10833                               Op0, Op1, Lo, Hi);
10834   // Replace the ADDs' nodes uses by the MLA node's values.
10835   SDValue HiMLALResult(SMLAL.getNode(), 1);
10836   SDValue LoMLALResult(SMLAL.getNode(), 0);
10837
10838   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
10839   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
10840
10841   // Return original node to notify the driver to stop replacing.
10842   SDValue resNode(AddcNode, 0);
10843   return resNode;
10844 }
10845
10846 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
10847                                      TargetLowering::DAGCombinerInfo &DCI,
10848                                      const ARMSubtarget *Subtarget) {
10849   // Look for multiply add opportunities.
10850   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
10851   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
10852   // a glue link from the first add to the second add.
10853   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
10854   // a S/UMLAL instruction.
10855   //                  UMUL_LOHI
10856   //                 / :lo    \ :hi
10857   //                V          \          [no multiline comment]
10858   //    loAdd ->  ADDC         |
10859   //                 \ :carry /
10860   //                  V      V
10861   //                    ADDE   <- hiAdd
10862   //
10863   // In the special case where only the higher part of a signed result is used
10864   // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
10865   // a constant with the exact value of 0x80000000, we recognize we are dealing
10866   // with a "rounded multiply and add" (or subtract) and transform it into
10867   // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
10868
10869   assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
10870           AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
10871          "Expect an ADDE or SUBE");
10872
10873   assert(AddeSubeNode->getNumOperands() == 3 &&
10874          AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
10875          "ADDE node has the wrong inputs");
10876
10877   // Check that we are chained to the right ADDC or SUBC node.
10878   SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
10879   if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
10880        AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
10881       (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
10882        AddcSubcNode->getOpcode() != ARMISD::SUBC))
10883     return SDValue();
10884
10885   SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
10886   SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
10887
10888   // Check if the two operands are from the same mul_lohi node.
10889   if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
10890     return SDValue();
10891
10892   assert(AddcSubcNode->getNumValues() == 2 &&
10893          AddcSubcNode->getValueType(0) == MVT::i32 &&
10894          "Expect ADDC with two result values. First: i32");
10895
10896   // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
10897   // maybe a SMLAL which multiplies two 16-bit values.
10898   if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
10899       AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
10900       AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
10901       AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
10902       AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
10903     return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
10904
10905   // Check for the triangle shape.
10906   SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
10907   SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
10908
10909   // Make sure that the ADDE/SUBE operands are not coming from the same node.
10910   if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
10911     return SDValue();
10912
10913   // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
10914   bool IsLeftOperandMUL = false;
10915   SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
10916   if (MULOp == SDValue())
10917     MULOp = findMUL_LOHI(AddeSubeOp1);
10918   else
10919     IsLeftOperandMUL = true;
10920   if (MULOp == SDValue())
10921     return SDValue();
10922
10923   // Figure out the right opcode.
10924   unsigned Opc = MULOp->getOpcode();
10925   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
10926
10927   // Figure out the high and low input values to the MLAL node.
10928   SDValue *HiAddSub = nullptr;
10929   SDValue *LoMul = nullptr;
10930   SDValue *LowAddSub = nullptr;
10931
10932   // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
10933   if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
10934     return SDValue();
10935
10936   if (IsLeftOperandMUL)
10937     HiAddSub = &AddeSubeOp1;
10938   else
10939     HiAddSub = &AddeSubeOp0;
10940
10941   // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
10942   // whose low result is fed to the ADDC/SUBC we are checking.
10943
10944   if (AddcSubcOp0 == MULOp.getValue(0)) {
10945     LoMul = &AddcSubcOp0;
10946     LowAddSub = &AddcSubcOp1;
10947   }
10948   if (AddcSubcOp1 == MULOp.getValue(0)) {
10949     LoMul = &AddcSubcOp1;
10950     LowAddSub = &AddcSubcOp0;
10951   }
10952
10953   if (!LoMul)
10954     return SDValue();
10955
10956   // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
10957   // the replacement below will create a cycle.
10958   if (AddcSubcNode == HiAddSub->getNode() ||
10959       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
10960     return SDValue();
10961
10962   // Create the merged node.
10963   SelectionDAG &DAG = DCI.DAG;
10964
10965   // Start building operand list.
10966   SmallVector<SDValue, 8> Ops;
10967   Ops.push_back(LoMul->getOperand(0));
10968   Ops.push_back(LoMul->getOperand(1));
10969
10970   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
10971   // the case, we must be doing signed multiplication and only use the higher
10972   // part of the result of the MLAL, furthermore the LowAddSub must be a constant
10973   // addition or subtraction with the value of 0x800000.
10974   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
10975       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
10976       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
10977       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
10978           0x80000000) {
10979     Ops.push_back(*HiAddSub);
10980     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
10981       FinalOpc = ARMISD::SMMLSR;
10982     } else {
10983       FinalOpc = ARMISD::SMMLAR;
10984     }
10985     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
10986     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
10987
10988     return SDValue(AddeSubeNode, 0);
10989   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
10990     // SMMLS is generated during instruction selection and the rest of this
10991     // function can not handle the case where AddcSubcNode is a SUBC.
10992     return SDValue();
10993
10994   // Finish building the operand list for {U/S}MLAL
10995   Ops.push_back(*LowAddSub);
10996   Ops.push_back(*HiAddSub);
10997
10998   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
10999                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
11000
11001   // Replace the ADDs' nodes uses by the MLA node's values.
11002   SDValue HiMLALResult(MLALNode.getNode(), 1);
11003   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
11004
11005   SDValue LoMLALResult(MLALNode.getNode(), 0);
11006   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
11007
11008   // Return original node to notify the driver to stop replacing.
11009   return SDValue(AddeSubeNode, 0);
11010 }
11011
11012 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
11013                                       TargetLowering::DAGCombinerInfo &DCI,
11014                                       const ARMSubtarget *Subtarget) {
11015   // UMAAL is similar to UMLAL except that it adds two unsigned values.
11016   // While trying to combine for the other MLAL nodes, first search for the
11017   // chance to use UMAAL. Check if Addc uses a node which has already
11018   // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
11019   // as the addend, and it's handled in PerformUMLALCombine.
11020
11021   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
11022     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
11023
11024   // Check that we have a glued ADDC node.
11025   SDNode* AddcNode = AddeNode->getOperand(2).getNode();
11026   if (AddcNode->getOpcode() != ARMISD::ADDC)
11027     return SDValue();
11028
11029   // Find the converted UMAAL or quit if it doesn't exist.
11030   SDNode *UmlalNode = nullptr;
11031   SDValue AddHi;
11032   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
11033     UmlalNode = AddcNode->getOperand(0).getNode();
11034     AddHi = AddcNode->getOperand(1);
11035   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
11036     UmlalNode = AddcNode->getOperand(1).getNode();
11037     AddHi = AddcNode->getOperand(0);
11038   } else {
11039     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
11040   }
11041
11042   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
11043   // the ADDC as well as Zero.
11044   if (!isNullConstant(UmlalNode->getOperand(3)))
11045     return SDValue();
11046
11047   if ((isNullConstant(AddeNode->getOperand(0)) &&
11048        AddeNode->getOperand(1).getNode() == UmlalNode) ||
11049       (AddeNode->getOperand(0).getNode() == UmlalNode &&
11050        isNullConstant(AddeNode->getOperand(1)))) {
11051     SelectionDAG &DAG = DCI.DAG;
11052     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
11053                       UmlalNode->getOperand(2), AddHi };
11054     SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
11055                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
11056
11057     // Replace the ADDs' nodes uses by the UMAAL node's values.
11058     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
11059     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
11060
11061     // Return original node to notify the driver to stop replacing.
11062     return SDValue(AddeNode, 0);
11063   }
11064   return SDValue();
11065 }
11066
11067 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
11068                                    const ARMSubtarget *Subtarget) {
11069   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
11070     return SDValue();
11071
11072   // Check that we have a pair of ADDC and ADDE as operands.
11073   // Both addends of the ADDE must be zero.
11074   SDNode* AddcNode = N->getOperand(2).getNode();
11075   SDNode* AddeNode = N->getOperand(3).getNode();
11076   if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
11077       (AddeNode->getOpcode() == ARMISD::ADDE) &&
11078       isNullConstant(AddeNode->getOperand(0)) &&
11079       isNullConstant(AddeNode->getOperand(1)) &&
11080       (AddeNode->getOperand(2).getNode() == AddcNode))
11081     return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
11082                        DAG.getVTList(MVT::i32, MVT::i32),
11083                        {N->getOperand(0), N->getOperand(1),
11084                         AddcNode->getOperand(0), AddcNode->getOperand(1)});
11085   else
11086     return SDValue();
11087 }
11088
11089 static SDValue PerformAddcSubcCombine(SDNode *N,
11090                                       TargetLowering::DAGCombinerInfo &DCI,
11091                                       const ARMSubtarget *Subtarget) {
11092   SelectionDAG &DAG(DCI.DAG);
11093
11094   if (N->getOpcode() == ARMISD::SUBC) {
11095     // (SUBC (ADDE 0, 0, C), 1) -> C
11096     SDValue LHS = N->getOperand(0);
11097     SDValue RHS = N->getOperand(1);
11098     if (LHS->getOpcode() == ARMISD::ADDE &&
11099         isNullConstant(LHS->getOperand(0)) &&
11100         isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
11101       return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
11102     }
11103   }
11104
11105   if (Subtarget->isThumb1Only()) {
11106     SDValue RHS = N->getOperand(1);
11107     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
11108       int32_t imm = C->getSExtValue();
11109       if (imm < 0 && imm > std::numeric_limits<int>::min()) {
11110         SDLoc DL(N);
11111         RHS = DAG.getConstant(-imm, DL, MVT::i32);
11112         unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
11113                                                            : ARMISD::ADDC;
11114         return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
11115       }
11116     }
11117   }
11118
11119   return SDValue();
11120 }
11121
11122 static SDValue PerformAddeSubeCombine(SDNode *N,
11123                                       TargetLowering::DAGCombinerInfo &DCI,
11124                                       const ARMSubtarget *Subtarget) {
11125   if (Subtarget->isThumb1Only()) {
11126     SelectionDAG &DAG = DCI.DAG;
11127     SDValue RHS = N->getOperand(1);
11128     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
11129       int64_t imm = C->getSExtValue();
11130       if (imm < 0) {
11131         SDLoc DL(N);
11132
11133         // The with-carry-in form matches bitwise not instead of the negation.
11134         // Effectively, the inverse interpretation of the carry flag already
11135         // accounts for part of the negation.
11136         RHS = DAG.getConstant(~imm, DL, MVT::i32);
11137
11138         unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
11139                                                            : ARMISD::ADDE;
11140         return DAG.getNode(Opcode, DL, N->getVTList(),
11141                            N->getOperand(0), RHS, N->getOperand(2));
11142       }
11143     }
11144   } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
11145     return AddCombineTo64bitMLAL(N, DCI, Subtarget);
11146   }
11147   return SDValue();
11148 }
11149
11150 static SDValue PerformABSCombine(SDNode *N,
11151                                   TargetLowering::DAGCombinerInfo &DCI,
11152                                   const ARMSubtarget *Subtarget) {
11153   SDValue res;
11154   SelectionDAG &DAG = DCI.DAG;
11155   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11156
11157   if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
11158     return SDValue();
11159
11160   if (!TLI.expandABS(N, res, DAG))
11161       return SDValue();
11162
11163   return res;
11164 }
11165
11166 /// PerformADDECombine - Target-specific dag combine transform from
11167 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
11168 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
11169 static SDValue PerformADDECombine(SDNode *N,
11170                                   TargetLowering::DAGCombinerInfo &DCI,
11171                                   const ARMSubtarget *Subtarget) {
11172   // Only ARM and Thumb2 support UMLAL/SMLAL.
11173   if (Subtarget->isThumb1Only())
11174     return PerformAddeSubeCombine(N, DCI, Subtarget);
11175
11176   // Only perform the checks after legalize when the pattern is available.
11177   if (DCI.isBeforeLegalize()) return SDValue();
11178
11179   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
11180 }
11181
11182 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
11183 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
11184 /// called with the default operands, and if that fails, with commuted
11185 /// operands.
11186 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
11187                                           TargetLowering::DAGCombinerInfo &DCI,
11188                                           const ARMSubtarget *Subtarget){
11189   // Attempt to create vpadd for this add.
11190   if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
11191     return Result;
11192
11193   // Attempt to create vpaddl for this add.
11194   if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
11195     return Result;
11196   if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
11197                                                       Subtarget))
11198     return Result;
11199
11200   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
11201   if (N0.getNode()->hasOneUse())
11202     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
11203       return Result;
11204   return SDValue();
11205 }
11206
11207 bool
11208 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
11209                                                  CombineLevel Level) const {
11210   if (Level == BeforeLegalizeTypes)
11211     return true;
11212
11213   if (N->getOpcode() != ISD::SHL)
11214     return true;
11215
11216   if (Subtarget->isThumb1Only()) {
11217     // Avoid making expensive immediates by commuting shifts. (This logic
11218     // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
11219     // for free.)
11220     if (N->getOpcode() != ISD::SHL)
11221       return true;
11222     SDValue N1 = N->getOperand(0);
11223     if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
11224         N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
11225       return true;
11226     if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
11227       if (Const->getAPIntValue().ult(256))
11228         return false;
11229       if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
11230           Const->getAPIntValue().sgt(-256))
11231         return false;
11232     }
11233     return true;
11234   }
11235
11236   // Turn off commute-with-shift transform after legalization, so it doesn't
11237   // conflict with PerformSHLSimplify.  (We could try to detect when
11238   // PerformSHLSimplify would trigger more precisely, but it isn't
11239   // really necessary.)
11240   return false;
11241 }
11242
11243 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
11244     const SDNode *N, CombineLevel Level) const {
11245   if (!Subtarget->isThumb1Only())
11246     return true;
11247
11248   if (Level == BeforeLegalizeTypes)
11249     return true;
11250
11251   return false;
11252 }
11253
11254 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
11255   if (!Subtarget->hasNEON()) {
11256     if (Subtarget->isThumb1Only())
11257       return VT.getScalarSizeInBits() <= 32;
11258     return true;
11259   }
11260   return VT.isScalarInteger();
11261 }
11262
11263 static SDValue PerformSHLSimplify(SDNode *N,
11264                                 TargetLowering::DAGCombinerInfo &DCI,
11265                                 const ARMSubtarget *ST) {
11266   // Allow the generic combiner to identify potential bswaps.
11267   if (DCI.isBeforeLegalize())
11268     return SDValue();
11269
11270   // DAG combiner will fold:
11271   // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
11272   // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
11273   // Other code patterns that can be also be modified have the following form:
11274   // b + ((a << 1) | 510)
11275   // b + ((a << 1) & 510)
11276   // b + ((a << 1) ^ 510)
11277   // b + ((a << 1) + 510)
11278
11279   // Many instructions can  perform the shift for free, but it requires both
11280   // the operands to be registers. If c1 << c2 is too large, a mov immediate
11281   // instruction will needed. So, unfold back to the original pattern if:
11282   // - if c1 and c2 are small enough that they don't require mov imms.
11283   // - the user(s) of the node can perform an shl
11284
11285   // No shifted operands for 16-bit instructions.
11286   if (ST->isThumb() && ST->isThumb1Only())
11287     return SDValue();
11288
11289   // Check that all the users could perform the shl themselves.
11290   for (auto U : N->uses()) {
11291     switch(U->getOpcode()) {
11292     default:
11293       return SDValue();
11294     case ISD::SUB:
11295     case ISD::ADD:
11296     case ISD::AND:
11297     case ISD::OR:
11298     case ISD::XOR:
11299     case ISD::SETCC:
11300     case ARMISD::CMP:
11301       // Check that the user isn't already using a constant because there
11302       // aren't any instructions that support an immediate operand and a
11303       // shifted operand.
11304       if (isa<ConstantSDNode>(U->getOperand(0)) ||
11305           isa<ConstantSDNode>(U->getOperand(1)))
11306         return SDValue();
11307
11308       // Check that it's not already using a shift.
11309       if (U->getOperand(0).getOpcode() == ISD::SHL ||
11310           U->getOperand(1).getOpcode() == ISD::SHL)
11311         return SDValue();
11312       break;
11313     }
11314   }
11315
11316   if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
11317       N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
11318     return SDValue();
11319
11320   if (N->getOperand(0).getOpcode() != ISD::SHL)
11321     return SDValue();
11322
11323   SDValue SHL = N->getOperand(0);
11324
11325   auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
11326   auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
11327   if (!C1ShlC2 || !C2)
11328     return SDValue();
11329
11330   APInt C2Int = C2->getAPIntValue();
11331   APInt C1Int = C1ShlC2->getAPIntValue();
11332
11333   // Check that performing a lshr will not lose any information.
11334   APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
11335                                      C2Int.getBitWidth() - C2->getZExtValue());
11336   if ((C1Int & Mask) != C1Int)
11337     return SDValue();
11338
11339   // Shift the first constant.
11340   C1Int.lshrInPlace(C2Int);
11341
11342   // The immediates are encoded as an 8-bit value that can be rotated.
11343   auto LargeImm = [](const APInt &Imm) {
11344     unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
11345     return Imm.getBitWidth() - Zeros > 8;
11346   };
11347
11348   if (LargeImm(C1Int) || LargeImm(C2Int))
11349     return SDValue();
11350
11351   SelectionDAG &DAG = DCI.DAG;
11352   SDLoc dl(N);
11353   SDValue X = SHL.getOperand(0);
11354   SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
11355                               DAG.getConstant(C1Int, dl, MVT::i32));
11356   // Shift left to compensate for the lshr of C1Int.
11357   SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
11358
11359   LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
11360              SHL.dump(); N->dump());
11361   LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
11362   return Res;
11363 }
11364
11365
11366 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
11367 ///
11368 static SDValue PerformADDCombine(SDNode *N,
11369                                  TargetLowering::DAGCombinerInfo &DCI,
11370                                  const ARMSubtarget *Subtarget) {
11371   SDValue N0 = N->getOperand(0);
11372   SDValue N1 = N->getOperand(1);
11373
11374   // Only works one way, because it needs an immediate operand.
11375   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
11376     return Result;
11377
11378   // First try with the default operand order.
11379   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
11380     return Result;
11381
11382   // If that didn't work, try again with the operands commuted.
11383   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
11384 }
11385
11386 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
11387 ///
11388 static SDValue PerformSUBCombine(SDNode *N,
11389                                  TargetLowering::DAGCombinerInfo &DCI) {
11390   SDValue N0 = N->getOperand(0);
11391   SDValue N1 = N->getOperand(1);
11392
11393   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
11394   if (N1.getNode()->hasOneUse())
11395     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
11396       return Result;
11397
11398   return SDValue();
11399 }
11400
11401 /// PerformVMULCombine
11402 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
11403 /// special multiplier accumulator forwarding.
11404 ///   vmul d3, d0, d2
11405 ///   vmla d3, d1, d2
11406 /// is faster than
11407 ///   vadd d3, d0, d1
11408 ///   vmul d3, d3, d2
11409 //  However, for (A + B) * (A + B),
11410 //    vadd d2, d0, d1
11411 //    vmul d3, d0, d2
11412 //    vmla d3, d1, d2
11413 //  is slower than
11414 //    vadd d2, d0, d1
11415 //    vmul d3, d2, d2
11416 static SDValue PerformVMULCombine(SDNode *N,
11417                                   TargetLowering::DAGCombinerInfo &DCI,
11418                                   const ARMSubtarget *Subtarget) {
11419   if (!Subtarget->hasVMLxForwarding())
11420     return SDValue();
11421
11422   SelectionDAG &DAG = DCI.DAG;
11423   SDValue N0 = N->getOperand(0);
11424   SDValue N1 = N->getOperand(1);
11425   unsigned Opcode = N0.getOpcode();
11426   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
11427       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
11428     Opcode = N1.getOpcode();
11429     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
11430         Opcode != ISD::FADD && Opcode != ISD::FSUB)
11431       return SDValue();
11432     std::swap(N0, N1);
11433   }
11434
11435   if (N0 == N1)
11436     return SDValue();
11437
11438   EVT VT = N->getValueType(0);
11439   SDLoc DL(N);
11440   SDValue N00 = N0->getOperand(0);
11441   SDValue N01 = N0->getOperand(1);
11442   return DAG.getNode(Opcode, DL, VT,
11443                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
11444                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
11445 }
11446
11447 static SDValue PerformMULCombine(SDNode *N,
11448                                  TargetLowering::DAGCombinerInfo &DCI,
11449                                  const ARMSubtarget *Subtarget) {
11450   SelectionDAG &DAG = DCI.DAG;
11451
11452   if (Subtarget->isThumb1Only())
11453     return SDValue();
11454
11455   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11456     return SDValue();
11457
11458   EVT VT = N->getValueType(0);
11459   if (VT.is64BitVector() || VT.is128BitVector())
11460     return PerformVMULCombine(N, DCI, Subtarget);
11461   if (VT != MVT::i32)
11462     return SDValue();
11463
11464   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11465   if (!C)
11466     return SDValue();
11467
11468   int64_t MulAmt = C->getSExtValue();
11469   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
11470
11471   ShiftAmt = ShiftAmt & (32 - 1);
11472   SDValue V = N->getOperand(0);
11473   SDLoc DL(N);
11474
11475   SDValue Res;
11476   MulAmt >>= ShiftAmt;
11477
11478   if (MulAmt >= 0) {
11479     if (isPowerOf2_32(MulAmt - 1)) {
11480       // (mul x, 2^N + 1) => (add (shl x, N), x)
11481       Res = DAG.getNode(ISD::ADD, DL, VT,
11482                         V,
11483                         DAG.getNode(ISD::SHL, DL, VT,
11484                                     V,
11485                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
11486                                                     MVT::i32)));
11487     } else if (isPowerOf2_32(MulAmt + 1)) {
11488       // (mul x, 2^N - 1) => (sub (shl x, N), x)
11489       Res = DAG.getNode(ISD::SUB, DL, VT,
11490                         DAG.getNode(ISD::SHL, DL, VT,
11491                                     V,
11492                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
11493                                                     MVT::i32)),
11494                         V);
11495     } else
11496       return SDValue();
11497   } else {
11498     uint64_t MulAmtAbs = -MulAmt;
11499     if (isPowerOf2_32(MulAmtAbs + 1)) {
11500       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
11501       Res = DAG.getNode(ISD::SUB, DL, VT,
11502                         V,
11503                         DAG.getNode(ISD::SHL, DL, VT,
11504                                     V,
11505                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
11506                                                     MVT::i32)));
11507     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
11508       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
11509       Res = DAG.getNode(ISD::ADD, DL, VT,
11510                         V,
11511                         DAG.getNode(ISD::SHL, DL, VT,
11512                                     V,
11513                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
11514                                                     MVT::i32)));
11515       Res = DAG.getNode(ISD::SUB, DL, VT,
11516                         DAG.getConstant(0, DL, MVT::i32), Res);
11517     } else
11518       return SDValue();
11519   }
11520
11521   if (ShiftAmt != 0)
11522     Res = DAG.getNode(ISD::SHL, DL, VT,
11523                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
11524
11525   // Do not add new nodes to DAG combiner worklist.
11526   DCI.CombineTo(N, Res, false);
11527   return SDValue();
11528 }
11529
11530 static SDValue CombineANDShift(SDNode *N,
11531                                TargetLowering::DAGCombinerInfo &DCI,
11532                                const ARMSubtarget *Subtarget) {
11533   // Allow DAGCombine to pattern-match before we touch the canonical form.
11534   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11535     return SDValue();
11536
11537   if (N->getValueType(0) != MVT::i32)
11538     return SDValue();
11539
11540   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11541   if (!N1C)
11542     return SDValue();
11543
11544   uint32_t C1 = (uint32_t)N1C->getZExtValue();
11545   // Don't transform uxtb/uxth.
11546   if (C1 == 255 || C1 == 65535)
11547     return SDValue();
11548
11549   SDNode *N0 = N->getOperand(0).getNode();
11550   if (!N0->hasOneUse())
11551     return SDValue();
11552
11553   if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
11554     return SDValue();
11555
11556   bool LeftShift = N0->getOpcode() == ISD::SHL;
11557
11558   ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
11559   if (!N01C)
11560     return SDValue();
11561
11562   uint32_t C2 = (uint32_t)N01C->getZExtValue();
11563   if (!C2 || C2 >= 32)
11564     return SDValue();
11565
11566   // Clear irrelevant bits in the mask.
11567   if (LeftShift)
11568     C1 &= (-1U << C2);
11569   else
11570     C1 &= (-1U >> C2);
11571
11572   SelectionDAG &DAG = DCI.DAG;
11573   SDLoc DL(N);
11574
11575   // We have a pattern of the form "(and (shl x, c2) c1)" or
11576   // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
11577   // transform to a pair of shifts, to save materializing c1.
11578
11579   // First pattern: right shift, then mask off leading bits.
11580   // FIXME: Use demanded bits?
11581   if (!LeftShift && isMask_32(C1)) {
11582     uint32_t C3 = countLeadingZeros(C1);
11583     if (C2 < C3) {
11584       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
11585                                 DAG.getConstant(C3 - C2, DL, MVT::i32));
11586       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
11587                          DAG.getConstant(C3, DL, MVT::i32));
11588     }
11589   }
11590
11591   // First pattern, reversed: left shift, then mask off trailing bits.
11592   if (LeftShift && isMask_32(~C1)) {
11593     uint32_t C3 = countTrailingZeros(C1);
11594     if (C2 < C3) {
11595       SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
11596                                 DAG.getConstant(C3 - C2, DL, MVT::i32));
11597       return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
11598                          DAG.getConstant(C3, DL, MVT::i32));
11599     }
11600   }
11601
11602   // Second pattern: left shift, then mask off leading bits.
11603   // FIXME: Use demanded bits?
11604   if (LeftShift && isShiftedMask_32(C1)) {
11605     uint32_t Trailing = countTrailingZeros(C1);
11606     uint32_t C3 = countLeadingZeros(C1);
11607     if (Trailing == C2 && C2 + C3 < 32) {
11608       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
11609                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
11610       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
11611                         DAG.getConstant(C3, DL, MVT::i32));
11612     }
11613   }
11614
11615   // Second pattern, reversed: right shift, then mask off trailing bits.
11616   // FIXME: Handle other patterns of known/demanded bits.
11617   if (!LeftShift && isShiftedMask_32(C1)) {
11618     uint32_t Leading = countLeadingZeros(C1);
11619     uint32_t C3 = countTrailingZeros(C1);
11620     if (Leading == C2 && C2 + C3 < 32) {
11621       SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
11622                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
11623       return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
11624                          DAG.getConstant(C3, DL, MVT::i32));
11625     }
11626   }
11627
11628   // FIXME: Transform "(and (shl x, c2) c1)" ->
11629   // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
11630   // c1.
11631   return SDValue();
11632 }
11633
11634 static SDValue PerformANDCombine(SDNode *N,
11635                                  TargetLowering::DAGCombinerInfo &DCI,
11636                                  const ARMSubtarget *Subtarget) {
11637   // Attempt to use immediate-form VBIC
11638   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
11639   SDLoc dl(N);
11640   EVT VT = N->getValueType(0);
11641   SelectionDAG &DAG = DCI.DAG;
11642
11643   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
11644     return SDValue();
11645
11646   APInt SplatBits, SplatUndef;
11647   unsigned SplatBitSize;
11648   bool HasAnyUndefs;
11649   if (BVN && Subtarget->hasNEON() &&
11650       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
11651     if (SplatBitSize <= 64) {
11652       EVT VbicVT;
11653       SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
11654                                       SplatUndef.getZExtValue(), SplatBitSize,
11655                                       DAG, dl, VbicVT, VT.is128BitVector(),
11656                                       OtherModImm);
11657       if (Val.getNode()) {
11658         SDValue Input =
11659           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
11660         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
11661         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
11662       }
11663     }
11664   }
11665
11666   if (!Subtarget->isThumb1Only()) {
11667     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
11668     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
11669       return Result;
11670
11671     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
11672       return Result;
11673   }
11674
11675   if (Subtarget->isThumb1Only())
11676     if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
11677       return Result;
11678
11679   return SDValue();
11680 }
11681
11682 // Try combining OR nodes to SMULWB, SMULWT.
11683 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
11684                                          TargetLowering::DAGCombinerInfo &DCI,
11685                                          const ARMSubtarget *Subtarget) {
11686   if (!Subtarget->hasV6Ops() ||
11687       (Subtarget->isThumb() &&
11688        (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
11689     return SDValue();
11690
11691   SDValue SRL = OR->getOperand(0);
11692   SDValue SHL = OR->getOperand(1);
11693
11694   if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
11695     SRL = OR->getOperand(1);
11696     SHL = OR->getOperand(0);
11697   }
11698   if (!isSRL16(SRL) || !isSHL16(SHL))
11699     return SDValue();
11700
11701   // The first operands to the shifts need to be the two results from the
11702   // same smul_lohi node.
11703   if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
11704        SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
11705     return SDValue();
11706
11707   SDNode *SMULLOHI = SRL.getOperand(0).getNode();
11708   if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
11709       SHL.getOperand(0) != SDValue(SMULLOHI, 1))
11710     return SDValue();
11711
11712   // Now we have:
11713   // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
11714   // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
11715   // For SMUWB the 16-bit value will signed extended somehow.
11716   // For SMULWT only the SRA is required.
11717   // Check both sides of SMUL_LOHI
11718   SDValue OpS16 = SMULLOHI->getOperand(0);
11719   SDValue OpS32 = SMULLOHI->getOperand(1);
11720
11721   SelectionDAG &DAG = DCI.DAG;
11722   if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
11723     OpS16 = OpS32;
11724     OpS32 = SMULLOHI->getOperand(0);
11725   }
11726
11727   SDLoc dl(OR);
11728   unsigned Opcode = 0;
11729   if (isS16(OpS16, DAG))
11730     Opcode = ARMISD::SMULWB;
11731   else if (isSRA16(OpS16)) {
11732     Opcode = ARMISD::SMULWT;
11733     OpS16 = OpS16->getOperand(0);
11734   }
11735   else
11736     return SDValue();
11737
11738   SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
11739   DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
11740   return SDValue(OR, 0);
11741 }
11742
11743 static SDValue PerformORCombineToBFI(SDNode *N,
11744                                      TargetLowering::DAGCombinerInfo &DCI,
11745                                      const ARMSubtarget *Subtarget) {
11746   // BFI is only available on V6T2+
11747   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
11748     return SDValue();
11749
11750   EVT VT = N->getValueType(0);
11751   SDValue N0 = N->getOperand(0);
11752   SDValue N1 = N->getOperand(1);
11753   SelectionDAG &DAG = DCI.DAG;
11754   SDLoc DL(N);
11755   // 1) or (and A, mask), val => ARMbfi A, val, mask
11756   //      iff (val & mask) == val
11757   //
11758   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
11759   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
11760   //          && mask == ~mask2
11761   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
11762   //          && ~mask == mask2
11763   //  (i.e., copy a bitfield value into another bitfield of the same width)
11764
11765   if (VT != MVT::i32)
11766     return SDValue();
11767
11768   SDValue N00 = N0.getOperand(0);
11769
11770   // The value and the mask need to be constants so we can verify this is
11771   // actually a bitfield set. If the mask is 0xffff, we can do better
11772   // via a movt instruction, so don't use BFI in that case.
11773   SDValue MaskOp = N0.getOperand(1);
11774   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
11775   if (!MaskC)
11776     return SDValue();
11777   unsigned Mask = MaskC->getZExtValue();
11778   if (Mask == 0xffff)
11779     return SDValue();
11780   SDValue Res;
11781   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
11782   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
11783   if (N1C) {
11784     unsigned Val = N1C->getZExtValue();
11785     if ((Val & ~Mask) != Val)
11786       return SDValue();
11787
11788     if (ARM::isBitFieldInvertedMask(Mask)) {
11789       Val >>= countTrailingZeros(~Mask);
11790
11791       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
11792                         DAG.getConstant(Val, DL, MVT::i32),
11793                         DAG.getConstant(Mask, DL, MVT::i32));
11794
11795       DCI.CombineTo(N, Res, false);
11796       // Return value from the original node to inform the combiner than N is
11797       // now dead.
11798       return SDValue(N, 0);
11799     }
11800   } else if (N1.getOpcode() == ISD::AND) {
11801     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
11802     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
11803     if (!N11C)
11804       return SDValue();
11805     unsigned Mask2 = N11C->getZExtValue();
11806
11807     // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
11808     // as is to match.
11809     if (ARM::isBitFieldInvertedMask(Mask) &&
11810         (Mask == ~Mask2)) {
11811       // The pack halfword instruction works better for masks that fit it,
11812       // so use that when it's available.
11813       if (Subtarget->hasDSP() &&
11814           (Mask == 0xffff || Mask == 0xffff0000))
11815         return SDValue();
11816       // 2a
11817       unsigned amt = countTrailingZeros(Mask2);
11818       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
11819                         DAG.getConstant(amt, DL, MVT::i32));
11820       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
11821                         DAG.getConstant(Mask, DL, MVT::i32));
11822       DCI.CombineTo(N, Res, false);
11823       // Return value from the original node to inform the combiner than N is
11824       // now dead.
11825       return SDValue(N, 0);
11826     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
11827                (~Mask == Mask2)) {
11828       // The pack halfword instruction works better for masks that fit it,
11829       // so use that when it's available.
11830       if (Subtarget->hasDSP() &&
11831           (Mask2 == 0xffff || Mask2 == 0xffff0000))
11832         return SDValue();
11833       // 2b
11834       unsigned lsb = countTrailingZeros(Mask);
11835       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
11836                         DAG.getConstant(lsb, DL, MVT::i32));
11837       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
11838                         DAG.getConstant(Mask2, DL, MVT::i32));
11839       DCI.CombineTo(N, Res, false);
11840       // Return value from the original node to inform the combiner than N is
11841       // now dead.
11842       return SDValue(N, 0);
11843     }
11844   }
11845
11846   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
11847       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
11848       ARM::isBitFieldInvertedMask(~Mask)) {
11849     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
11850     // where lsb(mask) == #shamt and masked bits of B are known zero.
11851     SDValue ShAmt = N00.getOperand(1);
11852     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
11853     unsigned LSB = countTrailingZeros(Mask);
11854     if (ShAmtC != LSB)
11855       return SDValue();
11856
11857     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
11858                       DAG.getConstant(~Mask, DL, MVT::i32));
11859
11860     DCI.CombineTo(N, Res, false);
11861     // Return value from the original node to inform the combiner than N is
11862     // now dead.
11863     return SDValue(N, 0);
11864   }
11865
11866   return SDValue();
11867 }
11868
11869 static bool isValidMVECond(unsigned CC, bool IsFloat) {
11870   switch (CC) {
11871   case ARMCC::EQ:
11872   case ARMCC::NE:
11873   case ARMCC::LE:
11874   case ARMCC::GT:
11875   case ARMCC::GE:
11876   case ARMCC::LT:
11877     return true;
11878   case ARMCC::HS:
11879   case ARMCC::HI:
11880     return !IsFloat;
11881   default:
11882     return false;
11883   };
11884 }
11885
11886 static SDValue PerformORCombine_i1(SDNode *N,
11887                                    TargetLowering::DAGCombinerInfo &DCI,
11888                                    const ARMSubtarget *Subtarget) {
11889   // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
11890   // together with predicates
11891   EVT VT = N->getValueType(0);
11892   SDValue N0 = N->getOperand(0);
11893   SDValue N1 = N->getOperand(1);
11894
11895   ARMCC::CondCodes CondCode0 = ARMCC::AL;
11896   ARMCC::CondCodes CondCode1 = ARMCC::AL;
11897   if (N0->getOpcode() == ARMISD::VCMP)
11898     CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
11899                     ->getZExtValue();
11900   else if (N0->getOpcode() == ARMISD::VCMPZ)
11901     CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
11902                     ->getZExtValue();
11903   if (N1->getOpcode() == ARMISD::VCMP)
11904     CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
11905                     ->getZExtValue();
11906   else if (N1->getOpcode() == ARMISD::VCMPZ)
11907     CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
11908                     ->getZExtValue();
11909
11910   if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
11911     return SDValue();
11912
11913   unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
11914   unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
11915
11916   if (!isValidMVECond(Opposite0,
11917                       N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
11918       !isValidMVECond(Opposite1,
11919                       N1->getOperand(0)->getValueType(0).isFloatingPoint()))
11920     return SDValue();
11921
11922   SmallVector<SDValue, 4> Ops0;
11923   Ops0.push_back(N0->getOperand(0));
11924   if (N0->getOpcode() == ARMISD::VCMP)
11925     Ops0.push_back(N0->getOperand(1));
11926   Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
11927   SmallVector<SDValue, 4> Ops1;
11928   Ops1.push_back(N1->getOperand(0));
11929   if (N1->getOpcode() == ARMISD::VCMP)
11930     Ops1.push_back(N1->getOperand(1));
11931   Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));
11932
11933   SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
11934   SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
11935   SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
11936   return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
11937                          DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
11938 }
11939
11940 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
11941 static SDValue PerformORCombine(SDNode *N,
11942                                 TargetLowering::DAGCombinerInfo &DCI,
11943                                 const ARMSubtarget *Subtarget) {
11944   // Attempt to use immediate-form VORR
11945   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
11946   SDLoc dl(N);
11947   EVT VT = N->getValueType(0);
11948   SelectionDAG &DAG = DCI.DAG;
11949
11950   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
11951     return SDValue();
11952
11953   APInt SplatBits, SplatUndef;
11954   unsigned SplatBitSize;
11955   bool HasAnyUndefs;
11956   if (BVN && Subtarget->hasNEON() &&
11957       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
11958     if (SplatBitSize <= 64) {
11959       EVT VorrVT;
11960       SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
11961                                       SplatUndef.getZExtValue(), SplatBitSize,
11962                                       DAG, dl, VorrVT, VT.is128BitVector(),
11963                                       OtherModImm);
11964       if (Val.getNode()) {
11965         SDValue Input =
11966           DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
11967         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
11968         return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
11969       }
11970     }
11971   }
11972
11973   if (!Subtarget->isThumb1Only()) {
11974     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
11975     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
11976       return Result;
11977     if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
11978       return Result;
11979   }
11980
11981   SDValue N0 = N->getOperand(0);
11982   SDValue N1 = N->getOperand(1);
11983
11984   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
11985   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
11986       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
11987
11988     // The code below optimizes (or (and X, Y), Z).
11989     // The AND operand needs to have a single user to make these optimizations
11990     // profitable.
11991     if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
11992       return SDValue();
11993
11994     APInt SplatUndef;
11995     unsigned SplatBitSize;
11996     bool HasAnyUndefs;
11997
11998     APInt SplatBits0, SplatBits1;
11999     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
12000     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
12001     // Ensure that the second operand of both ands are constants
12002     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
12003                                       HasAnyUndefs) && !HasAnyUndefs) {
12004         if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
12005                                           HasAnyUndefs) && !HasAnyUndefs) {
12006             // Ensure that the bit width of the constants are the same and that
12007             // the splat arguments are logical inverses as per the pattern we
12008             // are trying to simplify.
12009             if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
12010                 SplatBits0 == ~SplatBits1) {
12011                 // Canonicalize the vector type to make instruction selection
12012                 // simpler.
12013                 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
12014                 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
12015                                              N0->getOperand(1),
12016                                              N0->getOperand(0),
12017                                              N1->getOperand(0));
12018                 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
12019             }
12020         }
12021     }
12022   }
12023
12024   if (Subtarget->hasMVEIntegerOps() &&
12025       (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
12026     return PerformORCombine_i1(N, DCI, Subtarget);
12027
12028   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
12029   // reasonable.
12030   if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
12031     if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
12032       return Res;
12033   }
12034
12035   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
12036     return Result;
12037
12038   return SDValue();
12039 }
12040
12041 static SDValue PerformXORCombine(SDNode *N,
12042                                  TargetLowering::DAGCombinerInfo &DCI,
12043                                  const ARMSubtarget *Subtarget) {
12044   EVT VT = N->getValueType(0);
12045   SelectionDAG &DAG = DCI.DAG;
12046
12047   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
12048     return SDValue();
12049
12050   if (!Subtarget->isThumb1Only()) {
12051     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12052     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
12053       return Result;
12054
12055     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
12056       return Result;
12057   }
12058
12059   return SDValue();
12060 }
12061
12062 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
12063 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
12064 // their position in "to" (Rd).
12065 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
12066   assert(N->getOpcode() == ARMISD::BFI);
12067
12068   SDValue From = N->getOperand(1);
12069   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
12070   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
12071
12072   // If the Base came from a SHR #C, we can deduce that it is really testing bit
12073   // #C in the base of the SHR.
12074   if (From->getOpcode() == ISD::SRL &&
12075       isa<ConstantSDNode>(From->getOperand(1))) {
12076     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
12077     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
12078     FromMask <<= Shift.getLimitedValue(31);
12079     From = From->getOperand(0);
12080   }
12081
12082   return From;
12083 }
12084
12085 // If A and B contain one contiguous set of bits, does A | B == A . B?
12086 //
12087 // Neither A nor B must be zero.
12088 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
12089   unsigned LastActiveBitInA =  A.countTrailingZeros();
12090   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
12091   return LastActiveBitInA - 1 == FirstActiveBitInB;
12092 }
12093
12094 static SDValue FindBFIToCombineWith(SDNode *N) {
12095   // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
12096   // if one exists.
12097   APInt ToMask, FromMask;
12098   SDValue From = ParseBFI(N, ToMask, FromMask);
12099   SDValue To = N->getOperand(0);
12100
12101   // Now check for a compatible BFI to merge with. We can pass through BFIs that
12102   // aren't compatible, but not if they set the same bit in their destination as
12103   // we do (or that of any BFI we're going to combine with).
12104   SDValue V = To;
12105   APInt CombinedToMask = ToMask;
12106   while (V.getOpcode() == ARMISD::BFI) {
12107     APInt NewToMask, NewFromMask;
12108     SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
12109     if (NewFrom != From) {
12110       // This BFI has a different base. Keep going.
12111       CombinedToMask |= NewToMask;
12112       V = V.getOperand(0);
12113       continue;
12114     }
12115
12116     // Do the written bits conflict with any we've seen so far?
12117     if ((NewToMask & CombinedToMask).getBoolValue())
12118       // Conflicting bits - bail out because going further is unsafe.
12119       return SDValue();
12120
12121     // Are the new bits contiguous when combined with the old bits?
12122     if (BitsProperlyConcatenate(ToMask, NewToMask) &&
12123         BitsProperlyConcatenate(FromMask, NewFromMask))
12124       return V;
12125     if (BitsProperlyConcatenate(NewToMask, ToMask) &&
12126         BitsProperlyConcatenate(NewFromMask, FromMask))
12127       return V;
12128
12129     // We've seen a write to some bits, so track it.
12130     CombinedToMask |= NewToMask;
12131     // Keep going...
12132     V = V.getOperand(0);
12133   }
12134
12135   return SDValue();
12136 }
12137
12138 static SDValue PerformBFICombine(SDNode *N,
12139                                  TargetLowering::DAGCombinerInfo &DCI) {
12140   SDValue N1 = N->getOperand(1);
12141   if (N1.getOpcode() == ISD::AND) {
12142     // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
12143     // the bits being cleared by the AND are not demanded by the BFI.
12144     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
12145     if (!N11C)
12146       return SDValue();
12147     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
12148     unsigned LSB = countTrailingZeros(~InvMask);
12149     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
12150     assert(Width <
12151                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
12152            "undefined behavior");
12153     unsigned Mask = (1u << Width) - 1;
12154     unsigned Mask2 = N11C->getZExtValue();
12155     if ((Mask & (~Mask2)) == 0)
12156       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
12157                              N->getOperand(0), N1.getOperand(0),
12158                              N->getOperand(2));
12159   } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
12160     // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
12161     // Keep track of any consecutive bits set that all come from the same base
12162     // value. We can combine these together into a single BFI.
12163     SDValue CombineBFI = FindBFIToCombineWith(N);
12164     if (CombineBFI == SDValue())
12165       return SDValue();
12166
12167     // We've found a BFI.
12168     APInt ToMask1, FromMask1;
12169     SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
12170
12171     APInt ToMask2, FromMask2;
12172     SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
12173     assert(From1 == From2);
12174     (void)From2;
12175
12176     // First, unlink CombineBFI.
12177     DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
12178     // Then create a new BFI, combining the two together.
12179     APInt NewFromMask = FromMask1 | FromMask2;
12180     APInt NewToMask = ToMask1 | ToMask2;
12181
12182     EVT VT = N->getValueType(0);
12183     SDLoc dl(N);
12184
12185     if (NewFromMask[0] == 0)
12186       From1 = DCI.DAG.getNode(
12187         ISD::SRL, dl, VT, From1,
12188         DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
12189     return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
12190                            DCI.DAG.getConstant(~NewToMask, dl, VT));
12191   }
12192   return SDValue();
12193 }
12194
12195 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
12196 /// ARMISD::VMOVRRD.
12197 static SDValue PerformVMOVRRDCombine(SDNode *N,
12198                                      TargetLowering::DAGCombinerInfo &DCI,
12199                                      const ARMSubtarget *Subtarget) {
12200   // vmovrrd(vmovdrr x, y) -> x,y
12201   SDValue InDouble = N->getOperand(0);
12202   if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
12203     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
12204
12205   // vmovrrd(load f64) -> (load i32), (load i32)
12206   SDNode *InNode = InDouble.getNode();
12207   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
12208       InNode->getValueType(0) == MVT::f64 &&
12209       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
12210       !cast<LoadSDNode>(InNode)->isVolatile()) {
12211     // TODO: Should this be done for non-FrameIndex operands?
12212     LoadSDNode *LD = cast<LoadSDNode>(InNode);
12213
12214     SelectionDAG &DAG = DCI.DAG;
12215     SDLoc DL(LD);
12216     SDValue BasePtr = LD->getBasePtr();
12217     SDValue NewLD1 =
12218         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
12219                     LD->getAlignment(), LD->getMemOperand()->getFlags());
12220
12221     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
12222                                     DAG.getConstant(4, DL, MVT::i32));
12223
12224     SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
12225                                  LD->getPointerInfo().getWithOffset(4),
12226                                  std::min(4U, LD->getAlignment()),
12227                                  LD->getMemOperand()->getFlags());
12228
12229     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
12230     if (DCI.DAG.getDataLayout().isBigEndian())
12231       std::swap (NewLD1, NewLD2);
12232     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
12233     return Result;
12234   }
12235
12236   return SDValue();
12237 }
12238
12239 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
12240 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
12241 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
12242   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
12243   SDValue Op0 = N->getOperand(0);
12244   SDValue Op1 = N->getOperand(1);
12245   if (Op0.getOpcode() == ISD::BITCAST)
12246     Op0 = Op0.getOperand(0);
12247   if (Op1.getOpcode() == ISD::BITCAST)
12248     Op1 = Op1.getOperand(0);
12249   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
12250       Op0.getNode() == Op1.getNode() &&
12251       Op0.getResNo() == 0 && Op1.getResNo() == 1)
12252     return DAG.getNode(ISD::BITCAST, SDLoc(N),
12253                        N->getValueType(0), Op0.getOperand(0));
12254   return SDValue();
12255 }
12256
12257 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
12258 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
12259 /// i64 vector to have f64 elements, since the value can then be loaded
12260 /// directly into a VFP register.
12261 static bool hasNormalLoadOperand(SDNode *N) {
12262   unsigned NumElts = N->getValueType(0).getVectorNumElements();
12263   for (unsigned i = 0; i < NumElts; ++i) {
12264     SDNode *Elt = N->getOperand(i).getNode();
12265     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
12266       return true;
12267   }
12268   return false;
12269 }
12270
12271 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
12272 /// ISD::BUILD_VECTOR.
12273 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
12274                                           TargetLowering::DAGCombinerInfo &DCI,
12275                                           const ARMSubtarget *Subtarget) {
12276   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
12277   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
12278   // into a pair of GPRs, which is fine when the value is used as a scalar,
12279   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
12280   SelectionDAG &DAG = DCI.DAG;
12281   if (N->getNumOperands() == 2)
12282     if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
12283       return RV;
12284
12285   // Load i64 elements as f64 values so that type legalization does not split
12286   // them up into i32 values.
12287   EVT VT = N->getValueType(0);
12288   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
12289     return SDValue();
12290   SDLoc dl(N);
12291   SmallVector<SDValue, 8> Ops;
12292   unsigned NumElts = VT.getVectorNumElements();
12293   for (unsigned i = 0; i < NumElts; ++i) {
12294     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
12295     Ops.push_back(V);
12296     // Make the DAGCombiner fold the bitcast.
12297     DCI.AddToWorklist(V.getNode());
12298   }
12299   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
12300   SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
12301   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
12302 }
12303
12304 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
12305 static SDValue
12306 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12307   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
12308   // At that time, we may have inserted bitcasts from integer to float.
12309   // If these bitcasts have survived DAGCombine, change the lowering of this
12310   // BUILD_VECTOR in something more vector friendly, i.e., that does not
12311   // force to use floating point types.
12312
12313   // Make sure we can change the type of the vector.
12314   // This is possible iff:
12315   // 1. The vector is only used in a bitcast to a integer type. I.e.,
12316   //    1.1. Vector is used only once.
12317   //    1.2. Use is a bit convert to an integer type.
12318   // 2. The size of its operands are 32-bits (64-bits are not legal).
12319   EVT VT = N->getValueType(0);
12320   EVT EltVT = VT.getVectorElementType();
12321
12322   // Check 1.1. and 2.
12323   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
12324     return SDValue();
12325
12326   // By construction, the input type must be float.
12327   assert(EltVT == MVT::f32 && "Unexpected type!");
12328
12329   // Check 1.2.
12330   SDNode *Use = *N->use_begin();
12331   if (Use->getOpcode() != ISD::BITCAST ||
12332       Use->getValueType(0).isFloatingPoint())
12333     return SDValue();
12334
12335   // Check profitability.
12336   // Model is, if more than half of the relevant operands are bitcast from
12337   // i32, turn the build_vector into a sequence of insert_vector_elt.
12338   // Relevant operands are everything that is not statically
12339   // (i.e., at compile time) bitcasted.
12340   unsigned NumOfBitCastedElts = 0;
12341   unsigned NumElts = VT.getVectorNumElements();
12342   unsigned NumOfRelevantElts = NumElts;
12343   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
12344     SDValue Elt = N->getOperand(Idx);
12345     if (Elt->getOpcode() == ISD::BITCAST) {
12346       // Assume only bit cast to i32 will go away.
12347       if (Elt->getOperand(0).getValueType() == MVT::i32)
12348         ++NumOfBitCastedElts;
12349     } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
12350       // Constants are statically casted, thus do not count them as
12351       // relevant operands.
12352       --NumOfRelevantElts;
12353   }
12354
12355   // Check if more than half of the elements require a non-free bitcast.
12356   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
12357     return SDValue();
12358
12359   SelectionDAG &DAG = DCI.DAG;
12360   // Create the new vector type.
12361   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
12362   // Check if the type is legal.
12363   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12364   if (!TLI.isTypeLegal(VecVT))
12365     return SDValue();
12366
12367   // Combine:
12368   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
12369   // => BITCAST INSERT_VECTOR_ELT
12370   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
12371   //                      (BITCAST EN), N.
12372   SDValue Vec = DAG.getUNDEF(VecVT);
12373   SDLoc dl(N);
12374   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
12375     SDValue V = N->getOperand(Idx);
12376     if (V.isUndef())
12377       continue;
12378     if (V.getOpcode() == ISD::BITCAST &&
12379         V->getOperand(0).getValueType() == MVT::i32)
12380       // Fold obvious case.
12381       V = V.getOperand(0);
12382     else {
12383       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
12384       // Make the DAGCombiner fold the bitcasts.
12385       DCI.AddToWorklist(V.getNode());
12386     }
12387     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
12388     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
12389   }
12390   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
12391   // Make the DAGCombiner fold the bitcasts.
12392   DCI.AddToWorklist(Vec.getNode());
12393   return Vec;
12394 }
12395
12396 /// PerformInsertEltCombine - Target-specific dag combine xforms for
12397 /// ISD::INSERT_VECTOR_ELT.
12398 static SDValue PerformInsertEltCombine(SDNode *N,
12399                                        TargetLowering::DAGCombinerInfo &DCI) {
12400   // Bitcast an i64 load inserted into a vector to f64.
12401   // Otherwise, the i64 value will be legalized to a pair of i32 values.
12402   EVT VT = N->getValueType(0);
12403   SDNode *Elt = N->getOperand(1).getNode();
12404   if (VT.getVectorElementType() != MVT::i64 ||
12405       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
12406     return SDValue();
12407
12408   SelectionDAG &DAG = DCI.DAG;
12409   SDLoc dl(N);
12410   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
12411                                  VT.getVectorNumElements());
12412   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
12413   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
12414   // Make the DAGCombiner fold the bitcasts.
12415   DCI.AddToWorklist(Vec.getNode());
12416   DCI.AddToWorklist(V.getNode());
12417   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
12418                                Vec, V, N->getOperand(2));
12419   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
12420 }
12421
12422 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
12423 /// ISD::VECTOR_SHUFFLE.
12424 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
12425   // The LLVM shufflevector instruction does not require the shuffle mask
12426   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
12427   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
12428   // operands do not match the mask length, they are extended by concatenating
12429   // them with undef vectors.  That is probably the right thing for other
12430   // targets, but for NEON it is better to concatenate two double-register
12431   // size vector operands into a single quad-register size vector.  Do that
12432   // transformation here:
12433   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
12434   //   shuffle(concat(v1, v2), undef)
12435   SDValue Op0 = N->getOperand(0);
12436   SDValue Op1 = N->getOperand(1);
12437   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
12438       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
12439       Op0.getNumOperands() != 2 ||
12440       Op1.getNumOperands() != 2)
12441     return SDValue();
12442   SDValue Concat0Op1 = Op0.getOperand(1);
12443   SDValue Concat1Op1 = Op1.getOperand(1);
12444   if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
12445     return SDValue();
12446   // Skip the transformation if any of the types are illegal.
12447   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12448   EVT VT = N->getValueType(0);
12449   if (!TLI.isTypeLegal(VT) ||
12450       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
12451       !TLI.isTypeLegal(Concat1Op1.getValueType()))
12452     return SDValue();
12453
12454   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
12455                                   Op0.getOperand(0), Op1.getOperand(0));
12456   // Translate the shuffle mask.
12457   SmallVector<int, 16> NewMask;
12458   unsigned NumElts = VT.getVectorNumElements();
12459   unsigned HalfElts = NumElts/2;
12460   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
12461   for (unsigned n = 0; n < NumElts; ++n) {
12462     int MaskElt = SVN->getMaskElt(n);
12463     int NewElt = -1;
12464     if (MaskElt < (int)HalfElts)
12465       NewElt = MaskElt;
12466     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
12467       NewElt = HalfElts + MaskElt - NumElts;
12468     NewMask.push_back(NewElt);
12469   }
12470   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
12471                               DAG.getUNDEF(VT), NewMask);
12472 }
12473
12474 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
12475 /// NEON load/store intrinsics, and generic vector load/stores, to merge
12476 /// base address updates.
12477 /// For generic load/stores, the memory type is assumed to be a vector.
12478 /// The caller is assumed to have checked legality.
12479 static SDValue CombineBaseUpdate(SDNode *N,
12480                                  TargetLowering::DAGCombinerInfo &DCI) {
12481   SelectionDAG &DAG = DCI.DAG;
12482   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
12483                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
12484   const bool isStore = N->getOpcode() == ISD::STORE;
12485   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
12486   SDValue Addr = N->getOperand(AddrOpIdx);
12487   MemSDNode *MemN = cast<MemSDNode>(N);
12488   SDLoc dl(N);
12489
12490   // Search for a use of the address operand that is an increment.
12491   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
12492          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
12493     SDNode *User = *UI;
12494     if (User->getOpcode() != ISD::ADD ||
12495         UI.getUse().getResNo() != Addr.getResNo())
12496       continue;
12497
12498     // Check that the add is independent of the load/store.  Otherwise, folding
12499     // it would create a cycle. We can avoid searching through Addr as it's a
12500     // predecessor to both.
12501     SmallPtrSet<const SDNode *, 32> Visited;
12502     SmallVector<const SDNode *, 16> Worklist;
12503     Visited.insert(Addr.getNode());
12504     Worklist.push_back(N);
12505     Worklist.push_back(User);
12506     if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
12507         SDNode::hasPredecessorHelper(User, Visited, Worklist))
12508       continue;
12509
12510     // Find the new opcode for the updating load/store.
12511     bool isLoadOp = true;
12512     bool isLaneOp = false;
12513     unsigned NewOpc = 0;
12514     unsigned NumVecs = 0;
12515     if (isIntrinsic) {
12516       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
12517       switch (IntNo) {
12518       default: llvm_unreachable("unexpected intrinsic for Neon base update");
12519       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
12520         NumVecs = 1; break;
12521       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
12522         NumVecs = 2; break;
12523       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
12524         NumVecs = 3; break;
12525       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
12526         NumVecs = 4; break;
12527       case Intrinsic::arm_neon_vld2dup:
12528       case Intrinsic::arm_neon_vld3dup:
12529       case Intrinsic::arm_neon_vld4dup:
12530         // TODO: Support updating VLDxDUP nodes. For now, we just skip
12531         // combining base updates for such intrinsics.
12532         continue;
12533       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
12534         NumVecs = 2; isLaneOp = true; break;
12535       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
12536         NumVecs = 3; isLaneOp = true; break;
12537       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
12538         NumVecs = 4; isLaneOp = true; break;
12539       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
12540         NumVecs = 1; isLoadOp = false; break;
12541       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
12542         NumVecs = 2; isLoadOp = false; break;
12543       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
12544         NumVecs = 3; isLoadOp = false; break;
12545       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
12546         NumVecs = 4; isLoadOp = false; break;
12547       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
12548         NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
12549       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
12550         NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
12551       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
12552         NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
12553       }
12554     } else {
12555       isLaneOp = true;
12556       switch (N->getOpcode()) {
12557       default: llvm_unreachable("unexpected opcode for Neon base update");
12558       case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
12559       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
12560       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
12561       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
12562       case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
12563         NumVecs = 1; isLaneOp = false; break;
12564       case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
12565         NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
12566       }
12567     }
12568
12569     // Find the size of memory referenced by the load/store.
12570     EVT VecTy;
12571     if (isLoadOp) {
12572       VecTy = N->getValueType(0);
12573     } else if (isIntrinsic) {
12574       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
12575     } else {
12576       assert(isStore && "Node has to be a load, a store, or an intrinsic!");
12577       VecTy = N->getOperand(1).getValueType();
12578     }
12579
12580     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
12581     if (isLaneOp)
12582       NumBytes /= VecTy.getVectorNumElements();
12583
12584     // If the increment is a constant, it must match the memory ref size.
12585     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
12586     ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
12587     if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
12588       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
12589       // separate instructions that make it harder to use a non-constant update.
12590       continue;
12591     }
12592
12593     // OK, we found an ADD we can fold into the base update.
12594     // Now, create a _UPD node, taking care of not breaking alignment.
12595
12596     EVT AlignedVecTy = VecTy;
12597     unsigned Alignment = MemN->getAlignment();
12598
12599     // If this is a less-than-standard-aligned load/store, change the type to
12600     // match the standard alignment.
12601     // The alignment is overlooked when selecting _UPD variants; and it's
12602     // easier to introduce bitcasts here than fix that.
12603     // There are 3 ways to get to this base-update combine:
12604     // - intrinsics: they are assumed to be properly aligned (to the standard
12605     //   alignment of the memory type), so we don't need to do anything.
12606     // - ARMISD::VLDx nodes: they are only generated from the aforementioned
12607     //   intrinsics, so, likewise, there's nothing to do.
12608     // - generic load/store instructions: the alignment is specified as an
12609     //   explicit operand, rather than implicitly as the standard alignment
12610     //   of the memory type (like the intrisics).  We need to change the
12611     //   memory type to match the explicit alignment.  That way, we don't
12612     //   generate non-standard-aligned ARMISD::VLDx nodes.
12613     if (isa<LSBaseSDNode>(N)) {
12614       if (Alignment == 0)
12615         Alignment = 1;
12616       if (Alignment < VecTy.getScalarSizeInBits() / 8) {
12617         MVT EltTy = MVT::getIntegerVT(Alignment * 8);
12618         assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
12619         assert(!isLaneOp && "Unexpected generic load/store lane.");
12620         unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
12621         AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
12622       }
12623       // Don't set an explicit alignment on regular load/stores that we want
12624       // to transform to VLD/VST 1_UPD nodes.
12625       // This matches the behavior of regular load/stores, which only get an
12626       // explicit alignment if the MMO alignment is larger than the standard
12627       // alignment of the memory type.
12628       // Intrinsics, however, always get an explicit alignment, set to the
12629       // alignment of the MMO.
12630       Alignment = 1;
12631     }
12632
12633     // Create the new updating load/store node.
12634     // First, create an SDVTList for the new updating node's results.
12635     EVT Tys[6];
12636     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
12637     unsigned n;
12638     for (n = 0; n < NumResultVecs; ++n)
12639       Tys[n] = AlignedVecTy;
12640     Tys[n++] = MVT::i32;
12641     Tys[n] = MVT::Other;
12642     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
12643
12644     // Then, gather the new node's operands.
12645     SmallVector<SDValue, 8> Ops;
12646     Ops.push_back(N->getOperand(0)); // incoming chain
12647     Ops.push_back(N->getOperand(AddrOpIdx));
12648     Ops.push_back(Inc);
12649
12650     if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
12651       // Try to match the intrinsic's signature
12652       Ops.push_back(StN->getValue());
12653     } else {
12654       // Loads (and of course intrinsics) match the intrinsics' signature,
12655       // so just add all but the alignment operand.
12656       for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
12657         Ops.push_back(N->getOperand(i));
12658     }
12659
12660     // For all node types, the alignment operand is always the last one.
12661     Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
12662
12663     // If this is a non-standard-aligned STORE, the penultimate operand is the
12664     // stored value.  Bitcast it to the aligned type.
12665     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
12666       SDValue &StVal = Ops[Ops.size()-2];
12667       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
12668     }
12669
12670     EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
12671     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
12672                                            MemN->getMemOperand());
12673
12674     // Update the uses.
12675     SmallVector<SDValue, 5> NewResults;
12676     for (unsigned i = 0; i < NumResultVecs; ++i)
12677       NewResults.push_back(SDValue(UpdN.getNode(), i));
12678
12679     // If this is an non-standard-aligned LOAD, the first result is the loaded
12680     // value.  Bitcast it to the expected result type.
12681     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
12682       SDValue &LdVal = NewResults[0];
12683       LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
12684     }
12685
12686     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
12687     DCI.CombineTo(N, NewResults);
12688     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
12689
12690     break;
12691   }
12692   return SDValue();
12693 }
12694
12695 static SDValue PerformVLDCombine(SDNode *N,
12696                                  TargetLowering::DAGCombinerInfo &DCI) {
12697   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
12698     return SDValue();
12699
12700   return CombineBaseUpdate(N, DCI);
12701 }
12702
12703 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
12704 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
12705 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
12706 /// return true.
12707 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12708   SelectionDAG &DAG = DCI.DAG;
12709   EVT VT = N->getValueType(0);
12710   // vldN-dup instructions only support 64-bit vectors for N > 1.
12711   if (!VT.is64BitVector())
12712     return false;
12713
12714   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
12715   SDNode *VLD = N->getOperand(0).getNode();
12716   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
12717     return false;
12718   unsigned NumVecs = 0;
12719   unsigned NewOpc = 0;
12720   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
12721   if (IntNo == Intrinsic::arm_neon_vld2lane) {
12722     NumVecs = 2;
12723     NewOpc = ARMISD::VLD2DUP;
12724   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
12725     NumVecs = 3;
12726     NewOpc = ARMISD::VLD3DUP;
12727   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
12728     NumVecs = 4;
12729     NewOpc = ARMISD::VLD4DUP;
12730   } else {
12731     return false;
12732   }
12733
12734   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
12735   // numbers match the load.
12736   unsigned VLDLaneNo =
12737     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
12738   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
12739        UI != UE; ++UI) {
12740     // Ignore uses of the chain result.
12741     if (UI.getUse().getResNo() == NumVecs)
12742       continue;
12743     SDNode *User = *UI;
12744     if (User->getOpcode() != ARMISD::VDUPLANE ||
12745         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
12746       return false;
12747   }
12748
12749   // Create the vldN-dup node.
12750   EVT Tys[5];
12751   unsigned n;
12752   for (n = 0; n < NumVecs; ++n)
12753     Tys[n] = VT;
12754   Tys[n] = MVT::Other;
12755   SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
12756   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
12757   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
12758   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
12759                                            Ops, VLDMemInt->getMemoryVT(),
12760                                            VLDMemInt->getMemOperand());
12761
12762   // Update the uses.
12763   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
12764        UI != UE; ++UI) {
12765     unsigned ResNo = UI.getUse().getResNo();
12766     // Ignore uses of the chain result.
12767     if (ResNo == NumVecs)
12768       continue;
12769     SDNode *User = *UI;
12770     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
12771   }
12772
12773   // Now the vldN-lane intrinsic is dead except for its chain result.
12774   // Update uses of the chain.
12775   std::vector<SDValue> VLDDupResults;
12776   for (unsigned n = 0; n < NumVecs; ++n)
12777     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
12778   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
12779   DCI.CombineTo(VLD, VLDDupResults);
12780
12781   return true;
12782 }
12783
12784 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
12785 /// ARMISD::VDUPLANE.
12786 static SDValue PerformVDUPLANECombine(SDNode *N,
12787                                       TargetLowering::DAGCombinerInfo &DCI) {
12788   SDValue Op = N->getOperand(0);
12789
12790   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
12791   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
12792   if (CombineVLDDUP(N, DCI))
12793     return SDValue(N, 0);
12794
12795   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
12796   // redundant.  Ignore bit_converts for now; element sizes are checked below.
12797   while (Op.getOpcode() == ISD::BITCAST)
12798     Op = Op.getOperand(0);
12799   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
12800     return SDValue();
12801
12802   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
12803   unsigned EltSize = Op.getScalarValueSizeInBits();
12804   // The canonical VMOV for a zero vector uses a 32-bit element size.
12805   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
12806   unsigned EltBits;
12807   if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
12808     EltSize = 8;
12809   EVT VT = N->getValueType(0);
12810   if (EltSize > VT.getScalarSizeInBits())
12811     return SDValue();
12812
12813   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
12814 }
12815
12816 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
12817 static SDValue PerformVDUPCombine(SDNode *N,
12818                                   TargetLowering::DAGCombinerInfo &DCI,
12819                                   const ARMSubtarget *Subtarget) {
12820   SelectionDAG &DAG = DCI.DAG;
12821   SDValue Op = N->getOperand(0);
12822
12823   if (!Subtarget->hasNEON())
12824     return SDValue();
12825
12826   // Match VDUP(LOAD) -> VLD1DUP.
12827   // We match this pattern here rather than waiting for isel because the
12828   // transform is only legal for unindexed loads.
12829   LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
12830   if (LD && Op.hasOneUse() && LD->isUnindexed() &&
12831       LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
12832     SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
12833                       DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
12834     SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
12835     SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
12836                                              Ops, LD->getMemoryVT(),
12837                                              LD->getMemOperand());
12838     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
12839     return VLDDup;
12840   }
12841
12842   return SDValue();
12843 }
12844
12845 static SDValue PerformLOADCombine(SDNode *N,
12846                                   TargetLowering::DAGCombinerInfo &DCI) {
12847   EVT VT = N->getValueType(0);
12848
12849   // If this is a legal vector load, try to combine it into a VLD1_UPD.
12850   if (ISD::isNormalLoad(N) && VT.isVector() &&
12851       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
12852     return CombineBaseUpdate(N, DCI);
12853
12854   return SDValue();
12855 }
12856
12857 /// PerformSTORECombine - Target-specific dag combine xforms for
12858 /// ISD::STORE.
12859 static SDValue PerformSTORECombine(SDNode *N,
12860                                    TargetLowering::DAGCombinerInfo &DCI) {
12861   StoreSDNode *St = cast<StoreSDNode>(N);
12862   if (St->isVolatile())
12863     return SDValue();
12864
12865   // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
12866   // pack all of the elements in one place.  Next, store to memory in fewer
12867   // chunks.
12868   SDValue StVal = St->getValue();
12869   EVT VT = StVal.getValueType();
12870   if (St->isTruncatingStore() && VT.isVector()) {
12871     SelectionDAG &DAG = DCI.DAG;
12872     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12873     EVT StVT = St->getMemoryVT();
12874     unsigned NumElems = VT.getVectorNumElements();
12875     assert(StVT != VT && "Cannot truncate to the same type");
12876     unsigned FromEltSz = VT.getScalarSizeInBits();
12877     unsigned ToEltSz = StVT.getScalarSizeInBits();
12878
12879     // From, To sizes and ElemCount must be pow of two
12880     if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
12881
12882     // We are going to use the original vector elt for storing.
12883     // Accumulated smaller vector elements must be a multiple of the store size.
12884     if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
12885
12886     unsigned SizeRatio  = FromEltSz / ToEltSz;
12887     assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
12888
12889     // Create a type on which we perform the shuffle.
12890     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
12891                                      NumElems*SizeRatio);
12892     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
12893
12894     SDLoc DL(St);
12895     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
12896     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
12897     for (unsigned i = 0; i < NumElems; ++i)
12898       ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
12899                           ? (i + 1) * SizeRatio - 1
12900                           : i * SizeRatio;
12901
12902     // Can't shuffle using an illegal type.
12903     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
12904
12905     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
12906                                 DAG.getUNDEF(WideVec.getValueType()),
12907                                 ShuffleVec);
12908     // At this point all of the data is stored at the bottom of the
12909     // register. We now need to save it to mem.
12910
12911     // Find the largest store unit
12912     MVT StoreType = MVT::i8;
12913     for (MVT Tp : MVT::integer_valuetypes()) {
12914       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
12915         StoreType = Tp;
12916     }
12917     // Didn't find a legal store type.
12918     if (!TLI.isTypeLegal(StoreType))
12919       return SDValue();
12920
12921     // Bitcast the original vector into a vector of store-size units
12922     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
12923             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
12924     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
12925     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
12926     SmallVector<SDValue, 8> Chains;
12927     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
12928                                         TLI.getPointerTy(DAG.getDataLayout()));
12929     SDValue BasePtr = St->getBasePtr();
12930
12931     // Perform one or more big stores into memory.
12932     unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
12933     for (unsigned I = 0; I < E; I++) {
12934       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
12935                                    StoreType, ShuffWide,
12936                                    DAG.getIntPtrConstant(I, DL));
12937       SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
12938                                 St->getPointerInfo(), St->getAlignment(),
12939                                 St->getMemOperand()->getFlags());
12940       BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
12941                             Increment);
12942       Chains.push_back(Ch);
12943     }
12944     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
12945   }
12946
12947   if (!ISD::isNormalStore(St))
12948     return SDValue();
12949
12950   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
12951   // ARM stores of arguments in the same cache line.
12952   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
12953       StVal.getNode()->hasOneUse()) {
12954     SelectionDAG  &DAG = DCI.DAG;
12955     bool isBigEndian = DAG.getDataLayout().isBigEndian();
12956     SDLoc DL(St);
12957     SDValue BasePtr = St->getBasePtr();
12958     SDValue NewST1 = DAG.getStore(
12959         St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
12960         BasePtr, St->getPointerInfo(), St->getAlignment(),
12961         St->getMemOperand()->getFlags());
12962
12963     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
12964                                     DAG.getConstant(4, DL, MVT::i32));
12965     return DAG.getStore(NewST1.getValue(0), DL,
12966                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
12967                         OffsetPtr, St->getPointerInfo(),
12968                         std::min(4U, St->getAlignment() / 2),
12969                         St->getMemOperand()->getFlags());
12970   }
12971
12972   if (StVal.getValueType() == MVT::i64 &&
12973       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12974
12975     // Bitcast an i64 store extracted from a vector to f64.
12976     // Otherwise, the i64 value will be legalized to a pair of i32 values.
12977     SelectionDAG &DAG = DCI.DAG;
12978     SDLoc dl(StVal);
12979     SDValue IntVec = StVal.getOperand(0);
12980     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
12981                                    IntVec.getValueType().getVectorNumElements());
12982     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
12983     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
12984                                  Vec, StVal.getOperand(1));
12985     dl = SDLoc(N);
12986     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
12987     // Make the DAGCombiner fold the bitcasts.
12988     DCI.AddToWorklist(Vec.getNode());
12989     DCI.AddToWorklist(ExtElt.getNode());
12990     DCI.AddToWorklist(V.getNode());
12991     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
12992                         St->getPointerInfo(), St->getAlignment(),
12993                         St->getMemOperand()->getFlags(), St->getAAInfo());
12994   }
12995
12996   // If this is a legal vector store, try to combine it into a VST1_UPD.
12997   if (ISD::isNormalStore(N) && VT.isVector() &&
12998       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
12999     return CombineBaseUpdate(N, DCI);
13000
13001   return SDValue();
13002 }
13003
13004 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
13005 /// can replace combinations of VMUL and VCVT (floating-point to integer)
13006 /// when the VMUL has a constant operand that is a power of 2.
13007 ///
13008 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
13009 ///  vmul.f32        d16, d17, d16
13010 ///  vcvt.s32.f32    d16, d16
13011 /// becomes:
13012 ///  vcvt.s32.f32    d16, d16, #3
13013 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
13014                                   const ARMSubtarget *Subtarget) {
13015   if (!Subtarget->hasNEON())
13016     return SDValue();
13017
13018   SDValue Op = N->getOperand(0);
13019   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
13020       Op.getOpcode() != ISD::FMUL)
13021     return SDValue();
13022
13023   SDValue ConstVec = Op->getOperand(1);
13024   if (!isa<BuildVectorSDNode>(ConstVec))
13025     return SDValue();
13026
13027   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
13028   uint32_t FloatBits = FloatTy.getSizeInBits();
13029   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
13030   uint32_t IntBits = IntTy.getSizeInBits();
13031   unsigned NumLanes = Op.getValueType().getVectorNumElements();
13032   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
13033     // These instructions only exist converting from f32 to i32. We can handle
13034     // smaller integers by generating an extra truncate, but larger ones would
13035     // be lossy. We also can't handle anything other than 2 or 4 lanes, since
13036     // these intructions only support v2i32/v4i32 types.
13037     return SDValue();
13038   }
13039
13040   BitVector UndefElements;
13041   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
13042   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
13043   if (C == -1 || C == 0 || C > 32)
13044     return SDValue();
13045
13046   SDLoc dl(N);
13047   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
13048   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
13049     Intrinsic::arm_neon_vcvtfp2fxu;
13050   SDValue FixConv = DAG.getNode(
13051       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
13052       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
13053       DAG.getConstant(C, dl, MVT::i32));
13054
13055   if (IntBits < FloatBits)
13056     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
13057
13058   return FixConv;
13059 }
13060
13061 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
13062 /// can replace combinations of VCVT (integer to floating-point) and VDIV
13063 /// when the VDIV has a constant operand that is a power of 2.
13064 ///
13065 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
13066 ///  vcvt.f32.s32    d16, d16
13067 ///  vdiv.f32        d16, d17, d16
13068 /// becomes:
13069 ///  vcvt.f32.s32    d16, d16, #3
13070 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
13071                                   const ARMSubtarget *Subtarget) {
13072   if (!Subtarget->hasNEON())
13073     return SDValue();
13074
13075   SDValue Op = N->getOperand(0);
13076   unsigned OpOpcode = Op.getNode()->getOpcode();
13077   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
13078       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
13079     return SDValue();
13080
13081   SDValue ConstVec = N->getOperand(1);
13082   if (!isa<BuildVectorSDNode>(ConstVec))
13083     return SDValue();
13084
13085   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
13086   uint32_t FloatBits = FloatTy.getSizeInBits();
13087   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
13088   uint32_t IntBits = IntTy.getSizeInBits();
13089   unsigned NumLanes = Op.getValueType().getVectorNumElements();
13090   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
13091     // These instructions only exist converting from i32 to f32. We can handle
13092     // smaller integers by generating an extra extend, but larger ones would
13093     // be lossy. We also can't handle anything other than 2 or 4 lanes, since
13094     // these intructions only support v2i32/v4i32 types.
13095     return SDValue();
13096   }
13097
13098   BitVector UndefElements;
13099   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
13100   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
13101   if (C == -1 || C == 0 || C > 32)
13102     return SDValue();
13103
13104   SDLoc dl(N);
13105   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
13106   SDValue ConvInput = Op.getOperand(0);
13107   if (IntBits < FloatBits)
13108     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
13109                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
13110                             ConvInput);
13111
13112   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
13113     Intrinsic::arm_neon_vcvtfxu2fp;
13114   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
13115                      Op.getValueType(),
13116                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
13117                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
13118 }
13119
13120 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
13121 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
13122   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
13123   switch (IntNo) {
13124   default:
13125     // Don't do anything for most intrinsics.
13126     break;
13127
13128   // Vector shifts: check for immediate versions and lower them.
13129   // Note: This is done during DAG combining instead of DAG legalizing because
13130   // the build_vectors for 64-bit vector element shift counts are generally
13131   // not legal, and it is hard to see their values after they get legalized to
13132   // loads from a constant pool.
13133   case Intrinsic::arm_neon_vshifts:
13134   case Intrinsic::arm_neon_vshiftu:
13135   case Intrinsic::arm_neon_vrshifts:
13136   case Intrinsic::arm_neon_vrshiftu:
13137   case Intrinsic::arm_neon_vrshiftn:
13138   case Intrinsic::arm_neon_vqshifts:
13139   case Intrinsic::arm_neon_vqshiftu:
13140   case Intrinsic::arm_neon_vqshiftsu:
13141   case Intrinsic::arm_neon_vqshiftns:
13142   case Intrinsic::arm_neon_vqshiftnu:
13143   case Intrinsic::arm_neon_vqshiftnsu:
13144   case Intrinsic::arm_neon_vqrshiftns:
13145   case Intrinsic::arm_neon_vqrshiftnu:
13146   case Intrinsic::arm_neon_vqrshiftnsu: {
13147     EVT VT = N->getOperand(1).getValueType();
13148     int64_t Cnt;
13149     unsigned VShiftOpc = 0;
13150
13151     switch (IntNo) {
13152     case Intrinsic::arm_neon_vshifts:
13153     case Intrinsic::arm_neon_vshiftu:
13154       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
13155         VShiftOpc = ARMISD::VSHLIMM;
13156         break;
13157       }
13158       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
13159         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
13160                                                           : ARMISD::VSHRuIMM);
13161         break;
13162       }
13163       return SDValue();
13164
13165     case Intrinsic::arm_neon_vrshifts:
13166     case Intrinsic::arm_neon_vrshiftu:
13167       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
13168         break;
13169       return SDValue();
13170
13171     case Intrinsic::arm_neon_vqshifts:
13172     case Intrinsic::arm_neon_vqshiftu:
13173       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
13174         break;
13175       return SDValue();
13176
13177     case Intrinsic::arm_neon_vqshiftsu:
13178       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
13179         break;
13180       llvm_unreachable("invalid shift count for vqshlu intrinsic");
13181
13182     case Intrinsic::arm_neon_vrshiftn:
13183     case Intrinsic::arm_neon_vqshiftns:
13184     case Intrinsic::arm_neon_vqshiftnu:
13185     case Intrinsic::arm_neon_vqshiftnsu:
13186     case Intrinsic::arm_neon_vqrshiftns:
13187     case Intrinsic::arm_neon_vqrshiftnu:
13188     case Intrinsic::arm_neon_vqrshiftnsu:
13189       // Narrowing shifts require an immediate right shift.
13190       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
13191         break;
13192       llvm_unreachable("invalid shift count for narrowing vector shift "
13193                        "intrinsic");
13194
13195     default:
13196       llvm_unreachable("unhandled vector shift");
13197     }
13198
13199     switch (IntNo) {
13200     case Intrinsic::arm_neon_vshifts:
13201     case Intrinsic::arm_neon_vshiftu:
13202       // Opcode already set above.
13203       break;
13204     case Intrinsic::arm_neon_vrshifts:
13205       VShiftOpc = ARMISD::VRSHRsIMM;
13206       break;
13207     case Intrinsic::arm_neon_vrshiftu:
13208       VShiftOpc = ARMISD::VRSHRuIMM;
13209       break;
13210     case Intrinsic::arm_neon_vrshiftn:
13211       VShiftOpc = ARMISD::VRSHRNIMM;
13212       break;
13213     case Intrinsic::arm_neon_vqshifts:
13214       VShiftOpc = ARMISD::VQSHLsIMM;
13215       break;
13216     case Intrinsic::arm_neon_vqshiftu:
13217       VShiftOpc = ARMISD::VQSHLuIMM;
13218       break;
13219     case Intrinsic::arm_neon_vqshiftsu:
13220       VShiftOpc = ARMISD::VQSHLsuIMM;
13221       break;
13222     case Intrinsic::arm_neon_vqshiftns:
13223       VShiftOpc = ARMISD::VQSHRNsIMM;
13224       break;
13225     case Intrinsic::arm_neon_vqshiftnu:
13226       VShiftOpc = ARMISD::VQSHRNuIMM;
13227       break;
13228     case Intrinsic::arm_neon_vqshiftnsu:
13229       VShiftOpc = ARMISD::VQSHRNsuIMM;
13230       break;
13231     case Intrinsic::arm_neon_vqrshiftns:
13232       VShiftOpc = ARMISD::VQRSHRNsIMM;
13233       break;
13234     case Intrinsic::arm_neon_vqrshiftnu:
13235       VShiftOpc = ARMISD::VQRSHRNuIMM;
13236       break;
13237     case Intrinsic::arm_neon_vqrshiftnsu:
13238       VShiftOpc = ARMISD::VQRSHRNsuIMM;
13239       break;
13240     }
13241
13242     SDLoc dl(N);
13243     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
13244                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
13245   }
13246
13247   case Intrinsic::arm_neon_vshiftins: {
13248     EVT VT = N->getOperand(1).getValueType();
13249     int64_t Cnt;
13250     unsigned VShiftOpc = 0;
13251
13252     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
13253       VShiftOpc = ARMISD::VSLIIMM;
13254     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
13255       VShiftOpc = ARMISD::VSRIIMM;
13256     else {
13257       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
13258     }
13259
13260     SDLoc dl(N);
13261     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
13262                        N->getOperand(1), N->getOperand(2),
13263                        DAG.getConstant(Cnt, dl, MVT::i32));
13264   }
13265
13266   case Intrinsic::arm_neon_vqrshifts:
13267   case Intrinsic::arm_neon_vqrshiftu:
13268     // No immediate versions of these to check for.
13269     break;
13270   }
13271
13272   return SDValue();
13273 }
13274
13275 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
13276 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
13277 /// combining instead of DAG legalizing because the build_vectors for 64-bit
13278 /// vector element shift counts are generally not legal, and it is hard to see
13279 /// their values after they get legalized to loads from a constant pool.
13280 static SDValue PerformShiftCombine(SDNode *N,
13281                                    TargetLowering::DAGCombinerInfo &DCI,
13282                                    const ARMSubtarget *ST) {
13283   SelectionDAG &DAG = DCI.DAG;
13284   EVT VT = N->getValueType(0);
13285   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
13286     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
13287     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
13288     SDValue N1 = N->getOperand(1);
13289     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
13290       SDValue N0 = N->getOperand(0);
13291       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
13292           DAG.MaskedValueIsZero(N0.getOperand(0),
13293                                 APInt::getHighBitsSet(32, 16)))
13294         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
13295     }
13296   }
13297
13298   if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
13299       N->getOperand(0)->getOpcode() == ISD::AND &&
13300       N->getOperand(0)->hasOneUse()) {
13301     if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
13302       return SDValue();
13303     // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
13304     // usually show up because instcombine prefers to canonicalize it to
13305     // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
13306     // out of GEP lowering in some cases.
13307     SDValue N0 = N->getOperand(0);
13308     ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
13309     if (!ShiftAmtNode)
13310       return SDValue();
13311     uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
13312     ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
13313     if (!AndMaskNode)
13314       return SDValue();
13315     uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
13316     // Don't transform uxtb/uxth.
13317     if (AndMask == 255 || AndMask == 65535)
13318       return SDValue();
13319     if (isMask_32(AndMask)) {
13320       uint32_t MaskedBits = countLeadingZeros(AndMask);
13321       if (MaskedBits > ShiftAmt) {
13322         SDLoc DL(N);
13323         SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
13324                                   DAG.getConstant(MaskedBits, DL, MVT::i32));
13325         return DAG.getNode(
13326             ISD::SRL, DL, MVT::i32, SHL,
13327             DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
13328       }
13329     }
13330   }
13331
13332   // Nothing to be done for scalar shifts.
13333   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13334   if (!VT.isVector() || !TLI.isTypeLegal(VT))
13335     return SDValue();
13336   if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
13337     return SDValue();
13338
13339   int64_t Cnt;
13340
13341   switch (N->getOpcode()) {
13342   default: llvm_unreachable("unexpected shift opcode");
13343
13344   case ISD::SHL:
13345     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
13346       SDLoc dl(N);
13347       return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
13348                          DAG.getConstant(Cnt, dl, MVT::i32));
13349     }
13350     break;
13351
13352   case ISD::SRA:
13353   case ISD::SRL:
13354     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
13355       unsigned VShiftOpc =
13356           (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
13357       SDLoc dl(N);
13358       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
13359                          DAG.getConstant(Cnt, dl, MVT::i32));
13360     }
13361   }
13362   return SDValue();
13363 }
13364
13365 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
13366 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
13367 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
13368                                     const ARMSubtarget *ST) {
13369   SDValue N0 = N->getOperand(0);
13370
13371   // Check for sign- and zero-extensions of vector extract operations of 8-
13372   // and 16-bit vector elements.  NEON supports these directly.  They are
13373   // handled during DAG combining because type legalization will promote them
13374   // to 32-bit types and it is messy to recognize the operations after that.
13375   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13376     SDValue Vec = N0.getOperand(0);
13377     SDValue Lane = N0.getOperand(1);
13378     EVT VT = N->getValueType(0);
13379     EVT EltVT = N0.getValueType();
13380     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13381
13382     if (VT == MVT::i32 &&
13383         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
13384         TLI.isTypeLegal(Vec.getValueType()) &&
13385         isa<ConstantSDNode>(Lane)) {
13386
13387       unsigned Opc = 0;
13388       switch (N->getOpcode()) {
13389       default: llvm_unreachable("unexpected opcode");
13390       case ISD::SIGN_EXTEND:
13391         Opc = ARMISD::VGETLANEs;
13392         break;
13393       case ISD::ZERO_EXTEND:
13394       case ISD::ANY_EXTEND:
13395         Opc = ARMISD::VGETLANEu;
13396         break;
13397       }
13398       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
13399     }
13400   }
13401
13402   return SDValue();
13403 }
13404
13405 static const APInt *isPowerOf2Constant(SDValue V) {
13406   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
13407   if (!C)
13408     return nullptr;
13409   const APInt *CV = &C->getAPIntValue();
13410   return CV->isPowerOf2() ? CV : nullptr;
13411 }
13412
13413 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
13414   // If we have a CMOV, OR and AND combination such as:
13415   //   if (x & CN)
13416   //     y |= CM;
13417   //
13418   // And:
13419   //   * CN is a single bit;
13420   //   * All bits covered by CM are known zero in y
13421   //
13422   // Then we can convert this into a sequence of BFI instructions. This will
13423   // always be a win if CM is a single bit, will always be no worse than the
13424   // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
13425   // three bits (due to the extra IT instruction).
13426
13427   SDValue Op0 = CMOV->getOperand(0);
13428   SDValue Op1 = CMOV->getOperand(1);
13429   auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
13430   auto CC = CCNode->getAPIntValue().getLimitedValue();
13431   SDValue CmpZ = CMOV->getOperand(4);
13432
13433   // The compare must be against zero.
13434   if (!isNullConstant(CmpZ->getOperand(1)))
13435     return SDValue();
13436
13437   assert(CmpZ->getOpcode() == ARMISD::CMPZ);
13438   SDValue And = CmpZ->getOperand(0);
13439   if (And->getOpcode() != ISD::AND)
13440     return SDValue();
13441   const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
13442   if (!AndC)
13443     return SDValue();
13444   SDValue X = And->getOperand(0);
13445
13446   if (CC == ARMCC::EQ) {
13447     // We're performing an "equal to zero" compare. Swap the operands so we
13448     // canonicalize on a "not equal to zero" compare.
13449     std::swap(Op0, Op1);
13450   } else {
13451     assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
13452   }
13453
13454   if (Op1->getOpcode() != ISD::OR)
13455     return SDValue();
13456
13457   ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
13458   if (!OrC)
13459     return SDValue();
13460   SDValue Y = Op1->getOperand(0);
13461
13462   if (Op0 != Y)
13463     return SDValue();
13464
13465   // Now, is it profitable to continue?
13466   APInt OrCI = OrC->getAPIntValue();
13467   unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
13468   if (OrCI.countPopulation() > Heuristic)
13469     return SDValue();
13470
13471   // Lastly, can we determine that the bits defined by OrCI
13472   // are zero in Y?
13473   KnownBits Known = DAG.computeKnownBits(Y);
13474   if ((OrCI & Known.Zero) != OrCI)
13475     return SDValue();
13476
13477   // OK, we can do the combine.
13478   SDValue V = Y;
13479   SDLoc dl(X);
13480   EVT VT = X.getValueType();
13481   unsigned BitInX = AndC->logBase2();
13482
13483   if (BitInX != 0) {
13484     // We must shift X first.
13485     X = DAG.getNode(ISD::SRL, dl, VT, X,
13486                     DAG.getConstant(BitInX, dl, VT));
13487   }
13488
13489   for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
13490        BitInY < NumActiveBits; ++BitInY) {
13491     if (OrCI[BitInY] == 0)
13492       continue;
13493     APInt Mask(VT.getSizeInBits(), 0);
13494     Mask.setBit(BitInY);
13495     V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
13496                     // Confusingly, the operand is an *inverted* mask.
13497                     DAG.getConstant(~Mask, dl, VT));
13498   }
13499
13500   return V;
13501 }
13502
13503 // Given N, the value controlling the conditional branch, search for the loop
13504 // intrinsic, returning it, along with how the value is used. We need to handle
13505 // patterns such as the following:
13506 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
13507 // (brcond (setcc (loop.decrement), 0, eq), exit)
13508 // (brcond (setcc (loop.decrement), 0, ne), header)
13509 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
13510                                    bool &Negate) {
13511   switch (N->getOpcode()) {
13512   default:
13513     break;
13514   case ISD::XOR: {
13515     if (!isa<ConstantSDNode>(N.getOperand(1)))
13516       return SDValue();
13517     if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
13518       return SDValue();
13519     Negate = !Negate;
13520     return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
13521   }
13522   case ISD::SETCC: {
13523     auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
13524     if (!Const)
13525       return SDValue();
13526     if (Const->isNullValue())
13527       Imm = 0;
13528     else if (Const->isOne())
13529       Imm = 1;
13530     else
13531       return SDValue();
13532     CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
13533     return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
13534   }
13535   case ISD::INTRINSIC_W_CHAIN: {
13536     unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
13537     if (IntOp != Intrinsic::test_set_loop_iterations &&
13538         IntOp != Intrinsic::loop_decrement_reg)
13539       return SDValue();
13540     return N;
13541   }
13542   }
13543   return SDValue();
13544 }
13545
/// Rewrite a conditional branch that is controlled by one of the hardware-loop
/// intrinsics into the ARM low-overhead-loop nodes.
///
/// \p N is either an ISD::BRCOND or an ISD::BR_CC node. If its condition can be
/// traced (via SearchLoopIntrinsic) to test.set.loop.iterations, the branch is
/// replaced by an ARMISD::WLS node; if it traces to loop.decrement.reg, it is
/// replaced by an ARMISD::LOOP_DEC feeding an ARMISD::LE node. The
/// unconditional ISD::BR that uses \p N is retargeted when the branch sense
/// has to be flipped.
///
/// \returns the new WLS/LE node, or an empty SDValue when no loop intrinsic is
/// found or the BR_CC comparison is not against the constant 0 or 1.
/// \p ST is not consulted by this function.
static SDValue PerformHWLoopCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {

  // The hwloop intrinsics that we're interested in are used for control-flow,
  // either for entering or exiting the loop:
  // - test.set.loop.iterations will test whether its operand is zero. If it
  //   is zero, the following branch should not enter the loop.
  // - loop.decrement.reg also tests whether its operand is zero. If it is
  //   zero, the following branch should not branch back to the beginning of
  //   the loop.
  // So here, we need to check how the brcond is using the result of each
  // of the intrinsics to ensure that we're branching to the right place at the
  // right time.

  ISD::CondCode CC;
  SDValue Cond;
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;

  if (N->getOpcode() == ISD::BRCOND) {
    // BRCOND operands: (chain, condition, destination). Start from the
    // default "condition == 1"; SearchLoopIntrinsic may overwrite CC and Imm.
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    // BR_CC operands: (chain, condcode, lhs, rhs, destination).
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    // Only comparisons against the constants 0 and 1 are handled.
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isNullValue())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }

  // Look through the condition for the loop intrinsic; CC, Imm and Negate are
  // updated along the way.
  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();

  if (Negate)
    CC = ISD::getSetCCInverse(CC, true);

  // (CC, Imm) pairs for which the comparison holds when the tested value is
  // zero.
  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };

  // (CC, Imm) pairs for which the comparison fails when the tested value is
  // zero.
  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };

  // NOTE(review): any other (CC, Imm) combination is only rejected by this
  // assert; without assertions it is treated as "not true/false if zero" by
  // the code below.
  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");

  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  // Operand 2 of the intrinsic node is the element/iteration count (operand 0
  // is the chain and operand 1 the intrinsic ID).
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
  // The conditional branch is expected to be followed by exactly one
  // unconditional branch, which carries the other successor block.
  assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
          && "expected single br user");
  SDNode *Br = *N->use_begin();
  SDValue OtherTarget = Br->getOperand(1);

  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = { Br->getOperand(0), Dest };
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };

  if (IntOp == Intrinsic::test_set_loop_iterations) {
    SDValue Res;
    // We expect this 'instruction' to branch when the counter is zero.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = { Chain, Elements, Dest };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the following br.
      UpdateUncondBr(Br, Dest, DAG);

      SDValue Ops[] = { Chain, Elements, OtherTarget };
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    // Bypass the intrinsic in the chain: users of its chain result now use
    // the intrinsic's incoming chain.
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
    return Res;
  } else {
    // loop.decrement.reg: operand 3 is a constant that is forwarded to
    // LOOP_DEC as a target constant.
    SDValue Size = DAG.getTargetConstant(
      cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
    SDValue Args[] = { Int.getOperand(0), Elements, Size, };
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());

    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;

    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);

    // Make the loop-end node depend on both the decrement and the original
    // branch chain.
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);

    SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  // Not reached: both branches above return.
  return SDValue();
}
13667
13668 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
13669 SDValue
13670 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
13671   SDValue Cmp = N->getOperand(4);
13672   if (Cmp.getOpcode() != ARMISD::CMPZ)
13673     // Only looking at NE cases.
13674     return SDValue();
13675
13676   EVT VT = N->getValueType(0);
13677   SDLoc dl(N);
13678   SDValue LHS = Cmp.getOperand(0);
13679   SDValue RHS = Cmp.getOperand(1);
13680   SDValue Chain = N->getOperand(0);
13681   SDValue BB = N->getOperand(1);
13682   SDValue ARMcc = N->getOperand(2);
13683   ARMCC::CondCodes CC =
13684     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
13685
13686   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
13687   // -> (brcond Chain BB CC CPSR Cmp)
13688   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
13689       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
13690       LHS->getOperand(0)->hasOneUse()) {
13691     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
13692     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
13693     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
13694     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
13695     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
13696         (LHS01C && LHS01C->getZExtValue() == 1) &&
13697         (LHS1C && LHS1C->getZExtValue() == 1) &&
13698         (RHSC && RHSC->getZExtValue() == 0)) {
13699       return DAG.getNode(
13700           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
13701           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
13702     }
13703   }
13704
13705   return SDValue();
13706 }
13707
/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
///
/// \p N is a CMOV whose operands are read here as (FalseVal, TrueVal, ARMcc,
/// operand 3, Cmp). Only CMOVs whose compare operand is an ARMISD::CMPZ are
/// considered. The function tries, in order: a CMOV-to-BFI combine, removal
/// of a redundant move of the compared value, folding a CMOV of a CMOV-made
/// boolean, and (for integer types) branch-free materialisation of the
/// result.
///
/// \returns the replacement value, or an empty SDValue if nothing applies.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    // The false value equals the compared RHS, so LHS can stand in for it.
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    // Same idea with the condition inverted: rebuild the compare as NE.
    // Note this ARMcc deliberately shadows the outer one; getARMCmp fills it.
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  // The remaining combines only apply to integer results.
  // NOTE(review): a Res computed by the move-elimination above is discarded
  // here when VT is not an integer type.
  if (!VT.isInteger())
      return SDValue();

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //     (ADDCARRY (SUB x, y), t:0, C)
        // where t = (USUBO 0, (SUB x, y)) and C = 1 - t:1
        //
        // The USUBO computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //     x - y + (0 - (x - y)) + C == C
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        // FalseVal is known to be the constant zero in this branch.
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // The second result of the USUBO is a borrow but we want the carry
        // here, so compute 1 - borrow.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      // Copy the flags result of the SUBS into CPSR and use the resulting
      // glue value as the new CMOV's compare operand.
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), CPSRGlue.getValue(1));
      // Record the new false value so the Thumb1 combine below can match it.
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), CPSRGlue.getValue(1));
      // NOTE(review): CC and TrueVal are not updated here, so the Thumb1
      // combine below still sees the original EQ condition for this case.
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  // t1 = (USUBO (SUB x, y), 1)
  // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially, the same pattern, except there's no SUBS:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  // t1 = (USUBO x, 1)
  // t2 = (SUBCARRY x, t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBS &&
        FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    // For z != 1, compute the 0/1 result first and shift it into place after.
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
    // Capture demanded bits information that would be otherwise lost.
    // The three masks correspond to results known to fit in 1, 8 and 16 bits.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}
13885
/// Entry point for ARM-specific DAG combines: dispatch on the opcode of \p N
/// to the matching Perform*Combine helper.
///
/// \returns the value produced by the selected helper, or an empty SDValue
/// when the opcode has no combine here or the inline demanded-bits cases
/// below have nothing to return.
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  default: break;
  case ISD::ABS:        return PerformABSCombine(N, DCI, Subtarget);
  case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
  case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
  case ISD::SUB:        return PerformSUBCombine(N, DCI);
  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
  // Generic conditional branches are only inspected for hardware-loop
  // intrinsics.
  case ISD::BRCOND:
  case ISD::BR_CC:      return PerformHWLoopCombine(N, DCI, Subtarget);
  case ARMISD::ADDC:
  case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
  case ISD::STORE:      return PerformSTORECombine(N, DCI);
  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return PerformVCVTCombine(N, DCI.DAG, Subtarget);
  case ISD::FDIV:
    return PerformVDIVCombine(N, DCI.DAG, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
    return PerformShiftCombine(N, DCI, Subtarget);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
  case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
  case ISD::LOAD:       return PerformLOADCombine(N, DCI);
  case ARMISD::VLD1DUP:
  case ARMISD::VLD2DUP:
  case ARMISD::VLD3DUP:
  case ARMISD::VLD4DUP:
    return PerformVLDCombine(N, DCI);
  case ARMISD::BUILD_VECTOR:
    return PerformARMBUILD_VECTORCombine(N, DCI);
  // The cases below only try to simplify operands based on which 16-bit half
  // is demanded. When SimplifyDemandedBits reports a change, an empty SDValue
  // is returned.
  case ARMISD::SMULWB: {
    // Only the low 16 bits of operand 1 are demanded.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMULWT: {
    // Only the high 16 bits of operand 1 are demanded.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBB: {
    // Low 16 bits of both operand 0 and operand 1 are demanded.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALBT: {
    // Low 16 bits of operand 0, high 16 bits of operand 1.
    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTB: {
    // High 16 bits of operand 0, low 16 bits of operand 1.
    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
      return SDValue();
    break;
  }
  case ARMISD::SMLALTT: {
    // High 16 bits of both operand 0 and operand 1 are demanded.
    unsigned BitWidth = N->getValueType(0).getSizeInBits();
    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
      return SDValue();
    break;
  }
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    // Operand 1 holds the intrinsic ID; the NEON load/store intrinsics listed
    // here go through the same combine as the VLDnDUP nodes above.
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::arm_neon_vld1:
    case Intrinsic::arm_neon_vld1x2:
    case Intrinsic::arm_neon_vld1x3:
    case Intrinsic::arm_neon_vld1x4:
    case Intrinsic::arm_neon_vld2:
    case Intrinsic::arm_neon_vld3:
    case Intrinsic::arm_neon_vld4:
    case Intrinsic::arm_neon_vld2lane:
    case Intrinsic::arm_neon_vld3lane:
    case Intrinsic::arm_neon_vld4lane:
    case Intrinsic::arm_neon_vld2dup:
    case Intrinsic::arm_neon_vld3dup:
    case Intrinsic::arm_neon_vld4dup:
    case Intrinsic::arm_neon_vst1:
    case Intrinsic::arm_neon_vst1x2:
    case Intrinsic::arm_neon_vst1x3:
    case Intrinsic::arm_neon_vst1x4:
    case Intrinsic::arm_neon_vst2:
    case Intrinsic::arm_neon_vst3:
    case Intrinsic::arm_neon_vst4:
    case Intrinsic::arm_neon_vst2lane:
    case Intrinsic::arm_neon_vst3lane:
    case Intrinsic::arm_neon_vst4lane:
      return PerformVLDCombine(N, DCI);
    default: break;
    }
    break;
  }
  return SDValue();
}
14019
14020 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
14021                                                           EVT VT) const {
14022   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
14023 }
14024
14025 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
14026                                                        unsigned Alignment,
14027                                                        MachineMemOperand::Flags,
14028                                                        bool *Fast) const {
14029   // Depends what it gets converted into if the type is weird.
14030   if (!VT.isSimple())
14031     return false;
14032
14033   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
14034   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
14035   auto Ty = VT.getSimpleVT().SimpleTy;
14036
14037   if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
14038     // Unaligned access can use (for example) LRDB, LRDH, LDR
14039     if (AllowsUnaligned) {
14040       if (Fast)
14041         *Fast = Subtarget->hasV7Ops();
14042       return true;
14043     }
14044   }
14045
14046   if (Ty == MVT::f64 || Ty == MVT::v2f64) {
14047     // For any little-endian targets with neon, we can support unaligned ld/st
14048     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
14049     // A big-endian target may also explicitly support unaligned accesses
14050     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
14051       if (Fast)
14052         *Fast = true;
14053       return true;
14054     }
14055   }
14056
14057   if (!Subtarget->hasMVEIntegerOps())
14058     return false;
14059
14060   // These are for predicates
14061   if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
14062     if (Fast)
14063       *Fast = true;
14064     return true;
14065   }
14066
14067   if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
14068       Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
14069       Ty != MVT::v2f64 &&
14070       // These are for truncated stores
14071       Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16)
14072     return false;
14073
14074   if (Subtarget->isLittle()) {
14075     // In little-endian MVE, the store instructions VSTRB.U8,
14076     // VSTRH.U16 and VSTRW.U32 all store the vector register in
14077     // exactly the same format, and differ only in the range of
14078     // their immediate offset field and the required alignment.
14079     //
14080     // In particular, VSTRB.U8 can store a vector at byte alignment.
14081     // So at this stage we can simply say that loads/stores of all
14082     // 128-bit wide vector types are permitted at any alignment,
14083     // because we know at least _one_ instruction can manage that.
14084     //
14085     // Later on we might find that some of those loads are better
14086     // generated as VLDRW.U32 if alignment permits, to take
14087     // advantage of the larger immediate range. But for the moment,
14088     // all that matters is that if we don't lower the load then
14089     // _some_ instruction can handle it.
14090     if (Fast)
14091       *Fast = true;
14092     return true;
14093   } else {
14094     // In big-endian MVE, those instructions aren't so similar
14095     // after all, because they reorder the bytes of the vector
14096     // differently. So this time we can only store a particular
14097     // kind of vector if its alignment is at least the element
14098     // type. And we can't store vectors of i64 or f64 at all
14099     // without having to do some postprocessing, because there's
14100     // no VSTRD.U64.
14101     if (Ty == MVT::v16i8 ||
14102         ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
14103         ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
14104       if (Fast)
14105         *Fast = true;
14106       return true;
14107     }
14108   }
14109
14110   return false;
14111 }
14112
/// Return true when both alignments are compatible with \p AlignCheck, i.e.
/// each one is either zero or a multiple of \p AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  auto IsCompatible = [AlignCheck](unsigned Align) {
    return Align == 0 || Align % AlignCheck == 0;
  };
  return IsCompatible(SrcAlign) && IsCompatible(DstAlign);
}
14118
14119 EVT ARMTargetLowering::getOptimalMemOpType(
14120     uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
14121     bool ZeroMemset, bool MemcpyStrSrc,
14122     const AttributeList &FuncAttributes) const {
14123   // See if we can use NEON instructions for this...
14124   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
14125       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
14126     bool Fast;
14127     if (Size >= 16 &&
14128         (memOpAlign(SrcAlign, DstAlign, 16) ||
14129          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
14130                                          MachineMemOperand::MONone, &Fast) &&
14131           Fast))) {
14132       return MVT::v2f64;
14133     } else if (Size >= 8 &&
14134                (memOpAlign(SrcAlign, DstAlign, 8) ||
14135                 (allowsMisalignedMemoryAccesses(
14136                      MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
14137                  Fast))) {
14138       return MVT::f64;
14139     }
14140   }
14141
14142   // Let the target-independent logic figure it out.
14143   return MVT::Other;
14144 }
14145
14146 // 64-bit integers are split into their high and low parts and held in two
14147 // different registers, so the trunc is free since the low register can just
14148 // be used.
14149 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
14150   if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
14151     return false;
14152   unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
14153   unsigned DestBits = DstTy->getPrimitiveSizeInBits();
14154   return (SrcBits == 64 && DestBits == 32);
14155 }
14156
14157 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
14158   if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
14159       !DstVT.isInteger())
14160     return false;
14161   unsigned SrcBits = SrcVT.getSizeInBits();
14162   unsigned DestBits = DstVT.getSizeInBits();
14163   return (SrcBits == 64 && DestBits == 32);
14164 }
14165
14166 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
14167   if (Val.getOpcode() != ISD::LOAD)
14168     return false;
14169
14170   EVT VT1 = Val.getValueType();
14171   if (!VT1.isSimple() || !VT1.isInteger() ||
14172       !VT2.isSimple() || !VT2.isInteger())
14173     return false;
14174
14175   switch (VT1.getSimpleVT().SimpleTy) {
14176   default: break;
14177   case MVT::i1:
14178   case MVT::i8:
14179   case MVT::i16:
14180     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
14181     return true;
14182   }
14183
14184   return false;
14185 }
14186
14187 bool ARMTargetLowering::isFNegFree(EVT VT) const {
14188   if (!VT.isSimple())
14189     return false;
14190
14191   // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
14192   // negate values directly (fneg is free). So, we don't want to let the DAG
14193   // combiner rewrite fneg into xors and some other instructions.  For f16 and
14194   // FullFP16 argument passing, some bitcast nodes may be introduced,
14195   // triggering this DAG combine rewrite, so we are avoiding that with this.
14196   switch (VT.getSimpleVT().SimpleTy) {
14197   default: break;
14198   case MVT::f16:
14199     return Subtarget->hasFullFP16();
14200   }
14201
14202   return false;
14203 }
14204
14205 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
14206 /// of the vector elements.
14207 static bool areExtractExts(Value *Ext1, Value *Ext2) {
14208   auto areExtDoubled = [](Instruction *Ext) {
14209     return Ext->getType()->getScalarSizeInBits() ==
14210            2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
14211   };
14212
14213   if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
14214       !match(Ext2, m_ZExtOrSExt(m_Value())) ||
14215       !areExtDoubled(cast<Instruction>(Ext1)) ||
14216       !areExtDoubled(cast<Instruction>(Ext2)))
14217     return false;
14218
14219   return true;
14220 }
14221
14222 /// Check if sinking \p I's operands to I's basic block is profitable, because
14223 /// the operands can be folded into a target instruction, e.g.
14224 /// sext/zext can be folded into vsubl.
14225 bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
14226                                            SmallVectorImpl<Use *> &Ops) const {
14227   if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
14228     return false;
14229
14230   switch (I->getOpcode()) {
14231   case Instruction::Sub:
14232   case Instruction::Add: {
14233     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
14234       return false;
14235     Ops.push_back(&I->getOperandUse(0));
14236     Ops.push_back(&I->getOperandUse(1));
14237     return true;
14238   }
14239   default:
14240     return false;
14241   }
14242   return false;
14243 }
14244
14245 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
14246   EVT VT = ExtVal.getValueType();
14247
14248   if (!isTypeLegal(VT))
14249     return false;
14250
14251   // Don't create a loadext if we can fold the extension into a wide/long
14252   // instruction.
14253   // If there's more than one user instruction, the loadext is desirable no
14254   // matter what.  There can be two uses by the same instruction.
14255   if (ExtVal->use_empty() ||
14256       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
14257     return true;
14258
14259   SDNode *U = *ExtVal->use_begin();
14260   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
14261        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
14262     return false;
14263
14264   return true;
14265 }
14266
14267 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
14268   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
14269     return false;
14270
14271   if (!isTypeLegal(EVT::getEVT(Ty1)))
14272     return false;
14273
14274   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
14275
14276   // Assuming the caller doesn't have a zeroext or signext return parameter,
14277   // truncation all the way down to i1 is valid.
14278   return true;
14279 }
14280
14281 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
14282                                                 const AddrMode &AM, Type *Ty,
14283                                                 unsigned AS) const {
14284   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
14285     if (Subtarget->hasFPAO())
14286       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
14287     return 0;
14288   }
14289   return -1;
14290 }
14291
14292 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
14293   if (V < 0)
14294     return false;
14295
14296   unsigned Scale = 1;
14297   switch (VT.getSimpleVT().SimpleTy) {
14298   case MVT::i1:
14299   case MVT::i8:
14300     // Scale == 1;
14301     break;
14302   case MVT::i16:
14303     // Scale == 2;
14304     Scale = 2;
14305     break;
14306   default:
14307     // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
14308     // Scale == 4;
14309     Scale = 4;
14310     break;
14311   }
14312
14313   if ((V & (Scale - 1)) != 0)
14314     return false;
14315   return isUInt<5>(V / Scale);
14316 }
14317
14318 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
14319                                       const ARMSubtarget *Subtarget) {
14320   if (!VT.isInteger() && !VT.isFloatingPoint())
14321     return false;
14322   if (VT.isVector() && Subtarget->hasNEON())
14323     return false;
14324   if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
14325       !Subtarget->hasMVEFloatOps())
14326     return false;
14327
14328   bool IsNeg = false;
14329   if (V < 0) {
14330     IsNeg = true;
14331     V = -V;
14332   }
14333
14334   unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);
14335
14336   // MVE: size * imm7
14337   if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
14338     switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
14339     case MVT::i32:
14340     case MVT::f32:
14341       return isShiftedUInt<7,2>(V);
14342     case MVT::i16:
14343     case MVT::f16:
14344       return isShiftedUInt<7,1>(V);
14345     case MVT::i8:
14346       return isUInt<7>(V);
14347     default:
14348       return false;
14349     }
14350   }
14351
14352   // half VLDR: 2 * imm8
14353   if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
14354     return isShiftedUInt<8, 1>(V);
14355   // VLDR and LDRD: 4 * imm8
14356   if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
14357     return isShiftedUInt<8, 2>(V);
14358
14359   if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
14360     // + imm12 or - imm8
14361     if (IsNeg)
14362       return isUInt<8>(V);
14363     return isUInt<12>(V);
14364   }
14365
14366   return false;
14367 }
14368
14369 /// isLegalAddressImmediate - Return true if the integer value can be used
14370 /// as the offset of the target addressing mode for load / store of the
14371 /// given type.
14372 static bool isLegalAddressImmediate(int64_t V, EVT VT,
14373                                     const ARMSubtarget *Subtarget) {
14374   if (V == 0)
14375     return true;
14376
14377   if (!VT.isSimple())
14378     return false;
14379
14380   if (Subtarget->isThumb1Only())
14381     return isLegalT1AddressImmediate(V, VT);
14382   else if (Subtarget->isThumb2())
14383     return isLegalT2AddressImmediate(V, VT, Subtarget);
14384
14385   // ARM mode.
14386   if (V < 0)
14387     V = - V;
14388   switch (VT.getSimpleVT().SimpleTy) {
14389   default: return false;
14390   case MVT::i1:
14391   case MVT::i8:
14392   case MVT::i32:
14393     // +- imm12
14394     return isUInt<12>(V);
14395   case MVT::i16:
14396     // +- imm8
14397     return isUInt<8>(V);
14398   case MVT::f32:
14399   case MVT::f64:
14400     if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
14401       return false;
14402     return isShiftedUInt<8, 2>(V);
14403   }
14404 }
14405
14406 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
14407                                                       EVT VT) const {
14408   int Scale = AM.Scale;
14409   if (Scale < 0)
14410     return false;
14411
14412   switch (VT.getSimpleVT().SimpleTy) {
14413   default: return false;
14414   case MVT::i1:
14415   case MVT::i8:
14416   case MVT::i16:
14417   case MVT::i32:
14418     if (Scale == 1)
14419       return true;
14420     // r + r << imm
14421     Scale = Scale & ~1;
14422     return Scale == 2 || Scale == 4 || Scale == 8;
14423   case MVT::i64:
14424     // FIXME: What are we trying to model here? ldrd doesn't have an r + r
14425     // version in Thumb mode.
14426     // r + r
14427     if (Scale == 1)
14428       return true;
14429     // r * 2 (this can be lowered to r + r).
14430     if (!AM.HasBaseReg && Scale == 2)
14431       return true;
14432     return false;
14433   case MVT::isVoid:
14434     // Note, we allow "void" uses (basically, uses that aren't loads or
14435     // stores), because arm allows folding a scale into many arithmetic
14436     // operations.  This should be made more precise and revisited later.
14437
14438     // Allow r << imm, but the imm has to be a multiple of two.
14439     if (Scale & 1) return false;
14440     return isPowerOf2_32(Scale);
14441   }
14442 }
14443
14444 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
14445                                                       EVT VT) const {
14446   const int Scale = AM.Scale;
14447
14448   // Negative scales are not supported in Thumb1.
14449   if (Scale < 0)
14450     return false;
14451
14452   // Thumb1 addressing modes do not support register scaling excepting the
14453   // following cases:
14454   // 1. Scale == 1 means no scaling.
14455   // 2. Scale == 2 this can be lowered to r + r if there is no base register.
14456   return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
14457 }
14458
14459 /// isLegalAddressingMode - Return true if the addressing mode represented
14460 /// by AM is legal for this target, for a load/store of the specified type.
14461 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
14462                                               const AddrMode &AM, Type *Ty,
14463                                               unsigned AS, Instruction *I) const {
14464   EVT VT = getValueType(DL, Ty, true);
14465   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
14466     return false;
14467
14468   // Can never fold addr of global into load/store.
14469   if (AM.BaseGV)
14470     return false;
14471
14472   switch (AM.Scale) {
14473   case 0:  // no scale reg, must be "r+i" or "r", or "i".
14474     break;
14475   default:
14476     // ARM doesn't support any R+R*scale+imm addr modes.
14477     if (AM.BaseOffs)
14478       return false;
14479
14480     if (!VT.isSimple())
14481       return false;
14482
14483     if (Subtarget->isThumb1Only())
14484       return isLegalT1ScaledAddressingMode(AM, VT);
14485
14486     if (Subtarget->isThumb2())
14487       return isLegalT2ScaledAddressingMode(AM, VT);
14488
14489     int Scale = AM.Scale;
14490     switch (VT.getSimpleVT().SimpleTy) {
14491     default: return false;
14492     case MVT::i1:
14493     case MVT::i8:
14494     case MVT::i32:
14495       if (Scale < 0) Scale = -Scale;
14496       if (Scale == 1)
14497         return true;
14498       // r + r << imm
14499       return isPowerOf2_32(Scale & ~1);
14500     case MVT::i16:
14501     case MVT::i64:
14502       // r +/- r
14503       if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
14504         return true;
14505       // r * 2 (this can be lowered to r + r).
14506       if (!AM.HasBaseReg && Scale == 2)
14507         return true;
14508       return false;
14509
14510     case MVT::isVoid:
14511       // Note, we allow "void" uses (basically, uses that aren't loads or
14512       // stores), because arm allows folding a scale into many arithmetic
14513       // operations.  This should be made more precise and revisited later.
14514
14515       // Allow r << imm, but the imm has to be a multiple of two.
14516       if (Scale & 1) return false;
14517       return isPowerOf2_32(Scale);
14518     }
14519   }
14520   return true;
14521 }
14522
14523 /// isLegalICmpImmediate - Return true if the specified immediate is legal
14524 /// icmp immediate, that is the target has icmp instructions which can compare
14525 /// a register against the immediate without having to materialize the
14526 /// immediate into a register.
14527 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
14528   // Thumb2 and ARM modes can use cmn for negative immediates.
14529   if (!Subtarget->isThumb())
14530     return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
14531            ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
14532   if (Subtarget->isThumb2())
14533     return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
14534            ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
14535   // Thumb1 doesn't have cmn, and only 8-bit immediates.
14536   return Imm >= 0 && Imm <= 255;
14537 }
14538
14539 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
14540 /// *or sub* immediate, that is the target has add or sub instructions which can
14541 /// add a register with the immediate without having to materialize the
14542 /// immediate into a register.
14543 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
14544   // Same encoding for add/sub, just flip the sign.
14545   int64_t AbsImm = std::abs(Imm);
14546   if (!Subtarget->isThumb())
14547     return ARM_AM::getSOImmVal(AbsImm) != -1;
14548   if (Subtarget->isThumb2())
14549     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
14550   // Thumb1 only has 8-bit unsigned immediate.
14551   return AbsImm >= 0 && AbsImm <= 255;
14552 }
14553
14554 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
14555                                       bool isSEXTLoad, SDValue &Base,
14556                                       SDValue &Offset, bool &isInc,
14557                                       SelectionDAG &DAG) {
14558   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
14559     return false;
14560
14561   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
14562     // AddressingMode 3
14563     Base = Ptr->getOperand(0);
14564     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
14565       int RHSC = (int)RHS->getZExtValue();
14566       if (RHSC < 0 && RHSC > -256) {
14567         assert(Ptr->getOpcode() == ISD::ADD);
14568         isInc = false;
14569         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
14570         return true;
14571       }
14572     }
14573     isInc = (Ptr->getOpcode() == ISD::ADD);
14574     Offset = Ptr->getOperand(1);
14575     return true;
14576   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
14577     // AddressingMode 2
14578     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
14579       int RHSC = (int)RHS->getZExtValue();
14580       if (RHSC < 0 && RHSC > -0x1000) {
14581         assert(Ptr->getOpcode() == ISD::ADD);
14582         isInc = false;
14583         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
14584         Base = Ptr->getOperand(0);
14585         return true;
14586       }
14587     }
14588
14589     if (Ptr->getOpcode() == ISD::ADD) {
14590       isInc = true;
14591       ARM_AM::ShiftOpc ShOpcVal=
14592         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
14593       if (ShOpcVal != ARM_AM::no_shift) {
14594         Base = Ptr->getOperand(1);
14595         Offset = Ptr->getOperand(0);
14596       } else {
14597         Base = Ptr->getOperand(0);
14598         Offset = Ptr->getOperand(1);
14599       }
14600       return true;
14601     }
14602
14603     isInc = (Ptr->getOpcode() == ISD::ADD);
14604     Base = Ptr->getOperand(0);
14605     Offset = Ptr->getOperand(1);
14606     return true;
14607   }
14608
14609   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
14610   return false;
14611 }
14612
14613 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
14614                                      bool isSEXTLoad, SDValue &Base,
14615                                      SDValue &Offset, bool &isInc,
14616                                      SelectionDAG &DAG) {
14617   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
14618     return false;
14619
14620   Base = Ptr->getOperand(0);
14621   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
14622     int RHSC = (int)RHS->getZExtValue();
14623     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
14624       assert(Ptr->getOpcode() == ISD::ADD);
14625       isInc = false;
14626       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
14627       return true;
14628     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
14629       isInc = Ptr->getOpcode() == ISD::ADD;
14630       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
14631       return true;
14632     }
14633   }
14634
14635   return false;
14636 }
14637
14638 /// getPreIndexedAddressParts - returns true by value, base pointer and
14639 /// offset pointer and addressing mode by reference if the node's address
14640 /// can be legally represented as pre-indexed load / store address.
14641 bool
14642 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
14643                                              SDValue &Offset,
14644                                              ISD::MemIndexedMode &AM,
14645                                              SelectionDAG &DAG) const {
14646   if (Subtarget->isThumb1Only())
14647     return false;
14648
14649   EVT VT;
14650   SDValue Ptr;
14651   bool isSEXTLoad = false;
14652   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
14653     Ptr = LD->getBasePtr();
14654     VT  = LD->getMemoryVT();
14655     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
14656   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
14657     Ptr = ST->getBasePtr();
14658     VT  = ST->getMemoryVT();
14659   } else
14660     return false;
14661
14662   bool isInc;
14663   bool isLegal = false;
14664   if (Subtarget->isThumb2())
14665     isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
14666                                        Offset, isInc, DAG);
14667   else
14668     isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
14669                                         Offset, isInc, DAG);
14670   if (!isLegal)
14671     return false;
14672
14673   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
14674   return true;
14675 }
14676
14677 /// getPostIndexedAddressParts - returns true by value, base pointer and
14678 /// offset pointer and addressing mode by reference if this node can be
14679 /// combined with a load / store to form a post-indexed load / store.
14680 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
14681                                                    SDValue &Base,
14682                                                    SDValue &Offset,
14683                                                    ISD::MemIndexedMode &AM,
14684                                                    SelectionDAG &DAG) const {
14685   EVT VT;
14686   SDValue Ptr;
14687   bool isSEXTLoad = false, isNonExt;
14688   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
14689     VT  = LD->getMemoryVT();
14690     Ptr = LD->getBasePtr();
14691     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
14692     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
14693   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
14694     VT  = ST->getMemoryVT();
14695     Ptr = ST->getBasePtr();
14696     isNonExt = !ST->isTruncatingStore();
14697   } else
14698     return false;
14699
14700   if (Subtarget->isThumb1Only()) {
14701     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
14702     // must be non-extending/truncating, i32, with an offset of 4.
14703     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
14704     if (Op->getOpcode() != ISD::ADD || !isNonExt)
14705       return false;
14706     auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14707     if (!RHS || RHS->getZExtValue() != 4)
14708       return false;
14709
14710     Offset = Op->getOperand(1);
14711     Base = Op->getOperand(0);
14712     AM = ISD::POST_INC;
14713     return true;
14714   }
14715
14716   bool isInc;
14717   bool isLegal = false;
14718   if (Subtarget->isThumb2())
14719     isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
14720                                        isInc, DAG);
14721   else
14722     isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
14723                                         isInc, DAG);
14724   if (!isLegal)
14725     return false;
14726
14727   if (Ptr != Base) {
14728     // Swap base ptr and offset to catch more post-index load / store when
14729     // it's legal. In Thumb2 mode, offset must be an immediate.
14730     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
14731         !Subtarget->isThumb2())
14732       std::swap(Base, Offset);
14733
14734     // Post-indexed load / store update the base pointer.
14735     if (Ptr != Base)
14736       return false;
14737   }
14738
14739   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
14740   return true;
14741 }
14742
14743 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
14744                                                       KnownBits &Known,
14745                                                       const APInt &DemandedElts,
14746                                                       const SelectionDAG &DAG,
14747                                                       unsigned Depth) const {
14748   unsigned BitWidth = Known.getBitWidth();
14749   Known.resetAll();
14750   switch (Op.getOpcode()) {
14751   default: break;
14752   case ARMISD::ADDC:
14753   case ARMISD::ADDE:
14754   case ARMISD::SUBC:
14755   case ARMISD::SUBE:
14756     // Special cases when we convert a carry to a boolean.
14757     if (Op.getResNo() == 0) {
14758       SDValue LHS = Op.getOperand(0);
14759       SDValue RHS = Op.getOperand(1);
14760       // (ADDE 0, 0, C) will give us a single bit.
14761       if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
14762           isNullConstant(RHS)) {
14763         Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
14764         return;
14765       }
14766     }
14767     break;
14768   case ARMISD::CMOV: {
14769     // Bits are known zero/one if known on the LHS and RHS.
14770     Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
14771     if (Known.isUnknown())
14772       return;
14773
14774     KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
14775     Known.Zero &= KnownRHS.Zero;
14776     Known.One  &= KnownRHS.One;
14777     return;
14778   }
14779   case ISD::INTRINSIC_W_CHAIN: {
14780     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
14781     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
14782     switch (IntID) {
14783     default: return;
14784     case Intrinsic::arm_ldaex:
14785     case Intrinsic::arm_ldrex: {
14786       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
14787       unsigned MemBits = VT.getScalarSizeInBits();
14788       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
14789       return;
14790     }
14791     }
14792   }
14793   case ARMISD::BFI: {
14794     // Conservatively, we can recurse down the first operand
14795     // and just mask out all affected bits.
14796     Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
14797
14798     // The operand to BFI is already a mask suitable for removing the bits it
14799     // sets.
14800     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
14801     const APInt &Mask = CI->getAPIntValue();
14802     Known.Zero &= Mask;
14803     Known.One &= Mask;
14804     return;
14805   }
14806   case ARMISD::VGETLANEs:
14807   case ARMISD::VGETLANEu: {
14808     const SDValue &SrcSV = Op.getOperand(0);
14809     EVT VecVT = SrcSV.getValueType();
14810     assert(VecVT.isVector() && "VGETLANE expected a vector type");
14811     const unsigned NumSrcElts = VecVT.getVectorNumElements();
14812     ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
14813     assert(Pos->getAPIntValue().ult(NumSrcElts) &&
14814            "VGETLANE index out of bounds");
14815     unsigned Idx = Pos->getZExtValue();
14816     APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
14817     Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
14818
14819     EVT VT = Op.getValueType();
14820     const unsigned DstSz = VT.getScalarSizeInBits();
14821     const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
14822     (void)SrcSz;
14823     assert(SrcSz == Known.getBitWidth());
14824     assert(DstSz > SrcSz);
14825     if (Op.getOpcode() == ARMISD::VGETLANEs)
14826       Known = Known.sext(DstSz);
14827     else {
14828       Known = Known.zext(DstSz, true /* extended bits are known zero */);
14829     }
14830     assert(DstSz == Known.getBitWidth());
14831     break;
14832   }
14833   }
14834 }
14835
14836 bool
14837 ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
14838                                                 const APInt &DemandedAPInt,
14839                                                 TargetLoweringOpt &TLO) const {
14840   // Delay optimization, so we don't have to deal with illegal types, or block
14841   // optimizations.
14842   if (!TLO.LegalOps)
14843     return false;
14844
14845   // Only optimize AND for now.
14846   if (Op.getOpcode() != ISD::AND)
14847     return false;
14848
14849   EVT VT = Op.getValueType();
14850
14851   // Ignore vectors.
14852   if (VT.isVector())
14853     return false;
14854
14855   assert(VT == MVT::i32 && "Unexpected integer type");
14856
14857   // Make sure the RHS really is a constant.
14858   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
14859   if (!C)
14860     return false;
14861
14862   unsigned Mask = C->getZExtValue();
14863
14864   unsigned Demanded = DemandedAPInt.getZExtValue();
14865   unsigned ShrunkMask = Mask & Demanded;
14866   unsigned ExpandedMask = Mask | ~Demanded;
14867
14868   // If the mask is all zeros, let the target-independent code replace the
14869   // result with zero.
14870   if (ShrunkMask == 0)
14871     return false;
14872
14873   // If the mask is all ones, erase the AND. (Currently, the target-independent
14874   // code won't do this, so we have to do it explicitly to avoid an infinite
14875   // loop in obscure cases.)
14876   if (ExpandedMask == ~0U)
14877     return TLO.CombineTo(Op, Op.getOperand(0));
14878
14879   auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
14880     return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
14881   };
14882   auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
14883     if (NewMask == Mask)
14884       return true;
14885     SDLoc DL(Op);
14886     SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
14887     SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
14888     return TLO.CombineTo(Op, NewOp);
14889   };
14890
14891   // Prefer uxtb mask.
14892   if (IsLegalMask(0xFF))
14893     return UseMask(0xFF);
14894
14895   // Prefer uxth mask.
14896   if (IsLegalMask(0xFFFF))
14897     return UseMask(0xFFFF);
14898
14899   // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
14900   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
14901   if (ShrunkMask < 256)
14902     return UseMask(ShrunkMask);
14903
14904   // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
14905   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
14906   if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
14907     return UseMask(ExpandedMask);
14908
14909   // Potential improvements:
14910   //
14911   // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
14912   // We could try to prefer Thumb1 immediates which can be lowered to a
14913   // two-instruction sequence.
14914   // We could try to recognize more legal ARM/Thumb2 immediates here.
14915
14916   return false;
14917 }
14918
14919
14920 //===----------------------------------------------------------------------===//
14921 //                           ARM Inline Assembly Support
14922 //===----------------------------------------------------------------------===//
14923
14924 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
14925   // Looking for "rev" which is V6+.
14926   if (!Subtarget->hasV6Ops())
14927     return false;
14928
14929   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
14930   std::string AsmStr = IA->getAsmString();
14931   SmallVector<StringRef, 4> AsmPieces;
14932   SplitString(AsmStr, AsmPieces, ";\n");
14933
14934   switch (AsmPieces.size()) {
14935   default: return false;
14936   case 1:
14937     AsmStr = AsmPieces[0];
14938     AsmPieces.clear();
14939     SplitString(AsmStr, AsmPieces, " \t,");
14940
14941     // rev $0, $1
14942     if (AsmPieces.size() == 3 &&
14943         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
14944         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
14945       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
14946       if (Ty && Ty->getBitWidth() == 32)
14947         return IntrinsicLowering::LowerToByteSwap(CI);
14948     }
14949     break;
14950   }
14951
14952   return false;
14953 }
14954
14955 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
14956   // At this point, we have to lower this constraint to something else, so we
14957   // lower it to an "r" or "w". However, by doing this we will force the result
14958   // to be in register, while the X constraint is much more permissive.
14959   //
14960   // Although we are correct (we are free to emit anything, without
14961   // constraints), we might break use cases that would expect us to be more
14962   // efficient and emit something else.
14963   if (!Subtarget->hasVFP2Base())
14964     return "r";
14965   if (ConstraintVT.isFloatingPoint())
14966     return "w";
14967   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
14968      (ConstraintVT.getSizeInBits() == 64 ||
14969       ConstraintVT.getSizeInBits() == 128))
14970     return "w";
14971
14972   return "r";
14973 }
14974
14975 /// getConstraintType - Given a constraint letter, return the type of
14976 /// constraint it is for this target.
14977 ARMTargetLowering::ConstraintType
14978 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
14979   unsigned S = Constraint.size();
14980   if (S == 1) {
14981     switch (Constraint[0]) {
14982     default:  break;
14983     case 'l': return C_RegisterClass;
14984     case 'w': return C_RegisterClass;
14985     case 'h': return C_RegisterClass;
14986     case 'x': return C_RegisterClass;
14987     case 't': return C_RegisterClass;
14988     case 'j': return C_Immediate; // Constant for movw.
14989     // An address with a single base register. Due to the way we
14990     // currently handle addresses it is the same as an 'r' memory constraint.
14991     case 'Q': return C_Memory;
14992     }
14993   } else if (S == 2) {
14994     switch (Constraint[0]) {
14995     default: break;
14996     case 'T': return C_RegisterClass;
14997     // All 'U+' constraints are addresses.
14998     case 'U': return C_Memory;
14999     }
15000   }
15001   return TargetLowering::getConstraintType(Constraint);
15002 }
15003
15004 /// Examine constraint type and operand type and determine a weight value.
15005 /// This object must already have been set up with the operand type
15006 /// and the current alternative constraint selected.
15007 TargetLowering::ConstraintWeight
15008 ARMTargetLowering::getSingleConstraintMatchWeight(
15009     AsmOperandInfo &info, const char *constraint) const {
15010   ConstraintWeight weight = CW_Invalid;
15011   Value *CallOperandVal = info.CallOperandVal;
15012     // If we don't have a value, we can't do a match,
15013     // but allow it at the lowest weight.
15014   if (!CallOperandVal)
15015     return CW_Default;
15016   Type *type = CallOperandVal->getType();
15017   // Look at the constraint type.
15018   switch (*constraint) {
15019   default:
15020     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
15021     break;
15022   case 'l':
15023     if (type->isIntegerTy()) {
15024       if (Subtarget->isThumb())
15025         weight = CW_SpecificReg;
15026       else
15027         weight = CW_Register;
15028     }
15029     break;
15030   case 'w':
15031     if (type->isFloatingPointTy())
15032       weight = CW_Register;
15033     break;
15034   }
15035   return weight;
15036 }
15037
15038 using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
15039
15040 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
15041     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
15042   switch (Constraint.size()) {
15043   case 1:
15044     // GCC ARM Constraint Letters
15045     switch (Constraint[0]) {
15046     case 'l': // Low regs or general regs.
15047       if (Subtarget->isThumb())
15048         return RCPair(0U, &ARM::tGPRRegClass);
15049       return RCPair(0U, &ARM::GPRRegClass);
15050     case 'h': // High regs or no regs.
15051       if (Subtarget->isThumb())
15052         return RCPair(0U, &ARM::hGPRRegClass);
15053       break;
15054     case 'r':
15055       if (Subtarget->isThumb1Only())
15056         return RCPair(0U, &ARM::tGPRRegClass);
15057       return RCPair(0U, &ARM::GPRRegClass);
15058     case 'w':
15059       if (VT == MVT::Other)
15060         break;
15061       if (VT == MVT::f32)
15062         return RCPair(0U, &ARM::SPRRegClass);
15063       if (VT.getSizeInBits() == 64)
15064         return RCPair(0U, &ARM::DPRRegClass);
15065       if (VT.getSizeInBits() == 128)
15066         return RCPair(0U, &ARM::QPRRegClass);
15067       break;
15068     case 'x':
15069       if (VT == MVT::Other)
15070         break;
15071       if (VT == MVT::f32)
15072         return RCPair(0U, &ARM::SPR_8RegClass);
15073       if (VT.getSizeInBits() == 64)
15074         return RCPair(0U, &ARM::DPR_8RegClass);
15075       if (VT.getSizeInBits() == 128)
15076         return RCPair(0U, &ARM::QPR_8RegClass);
15077       break;
15078     case 't':
15079       if (VT == MVT::Other)
15080         break;
15081       if (VT == MVT::f32 || VT == MVT::i32)
15082         return RCPair(0U, &ARM::SPRRegClass);
15083       if (VT.getSizeInBits() == 64)
15084         return RCPair(0U, &ARM::DPR_VFP2RegClass);
15085       if (VT.getSizeInBits() == 128)
15086         return RCPair(0U, &ARM::QPR_VFP2RegClass);
15087       break;
15088     }
15089     break;
15090
15091   case 2:
15092     if (Constraint[0] == 'T') {
15093       switch (Constraint[1]) {
15094       default:
15095         break;
15096       case 'e':
15097         return RCPair(0U, &ARM::tGPREvenRegClass);
15098       case 'o':
15099         return RCPair(0U, &ARM::tGPROddRegClass);
15100       }
15101     }
15102     break;
15103
15104   default:
15105     break;
15106   }
15107
15108   if (StringRef("{cc}").equals_lower(Constraint))
15109     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
15110
15111   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15112 }
15113
15114 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
15115 /// vector.  If it is invalid, don't add anything to Ops.
15116 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15117                                                      std::string &Constraint,
15118                                                      std::vector<SDValue>&Ops,
15119                                                      SelectionDAG &DAG) const {
15120   SDValue Result;
15121
15122   // Currently only support length 1 constraints.
15123   if (Constraint.length() != 1) return;
15124
15125   char ConstraintLetter = Constraint[0];
15126   switch (ConstraintLetter) {
15127   default: break;
15128   case 'j':
15129   case 'I': case 'J': case 'K': case 'L':
15130   case 'M': case 'N': case 'O':
15131     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
15132     if (!C)
15133       return;
15134
15135     int64_t CVal64 = C->getSExtValue();
15136     int CVal = (int) CVal64;
15137     // None of these constraints allow values larger than 32 bits.  Check
15138     // that the value fits in an int.
15139     if (CVal != CVal64)
15140       return;
15141
15142     switch (ConstraintLetter) {
15143       case 'j':
15144         // Constant suitable for movw, must be between 0 and
15145         // 65535.
15146         if (Subtarget->hasV6T2Ops())
15147           if (CVal >= 0 && CVal <= 65535)
15148             break;
15149         return;
15150       case 'I':
15151         if (Subtarget->isThumb1Only()) {
15152           // This must be a constant between 0 and 255, for ADD
15153           // immediates.
15154           if (CVal >= 0 && CVal <= 255)
15155             break;
15156         } else if (Subtarget->isThumb2()) {
15157           // A constant that can be used as an immediate value in a
15158           // data-processing instruction.
15159           if (ARM_AM::getT2SOImmVal(CVal) != -1)
15160             break;
15161         } else {
15162           // A constant that can be used as an immediate value in a
15163           // data-processing instruction.
15164           if (ARM_AM::getSOImmVal(CVal) != -1)
15165             break;
15166         }
15167         return;
15168
15169       case 'J':
15170         if (Subtarget->isThumb1Only()) {
15171           // This must be a constant between -255 and -1, for negated ADD
15172           // immediates. This can be used in GCC with an "n" modifier that
15173           // prints the negated value, for use with SUB instructions. It is
15174           // not useful otherwise but is implemented for compatibility.
15175           if (CVal >= -255 && CVal <= -1)
15176             break;
15177         } else {
15178           // This must be a constant between -4095 and 4095. It is not clear
15179           // what this constraint is intended for. Implemented for
15180           // compatibility with GCC.
15181           if (CVal >= -4095 && CVal <= 4095)
15182             break;
15183         }
15184         return;
15185
15186       case 'K':
15187         if (Subtarget->isThumb1Only()) {
15188           // A 32-bit value where only one byte has a nonzero value. Exclude
15189           // zero to match GCC. This constraint is used by GCC internally for
15190           // constants that can be loaded with a move/shift combination.
15191           // It is not useful otherwise but is implemented for compatibility.
15192           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
15193             break;
15194         } else if (Subtarget->isThumb2()) {
15195           // A constant whose bitwise inverse can be used as an immediate
15196           // value in a data-processing instruction. This can be used in GCC
15197           // with a "B" modifier that prints the inverted value, for use with
15198           // BIC and MVN instructions. It is not useful otherwise but is
15199           // implemented for compatibility.
15200           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
15201             break;
15202         } else {
15203           // A constant whose bitwise inverse can be used as an immediate
15204           // value in a data-processing instruction. This can be used in GCC
15205           // with a "B" modifier that prints the inverted value, for use with
15206           // BIC and MVN instructions. It is not useful otherwise but is
15207           // implemented for compatibility.
15208           if (ARM_AM::getSOImmVal(~CVal) != -1)
15209             break;
15210         }
15211         return;
15212
15213       case 'L':
15214         if (Subtarget->isThumb1Only()) {
15215           // This must be a constant between -7 and 7,
15216           // for 3-operand ADD/SUB immediate instructions.
15217           if (CVal >= -7 && CVal < 7)
15218             break;
15219         } else if (Subtarget->isThumb2()) {
15220           // A constant whose negation can be used as an immediate value in a
15221           // data-processing instruction. This can be used in GCC with an "n"
15222           // modifier that prints the negated value, for use with SUB
15223           // instructions. It is not useful otherwise but is implemented for
15224           // compatibility.
15225           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
15226             break;
15227         } else {
15228           // A constant whose negation can be used as an immediate value in a
15229           // data-processing instruction. This can be used in GCC with an "n"
15230           // modifier that prints the negated value, for use with SUB
15231           // instructions. It is not useful otherwise but is implemented for
15232           // compatibility.
15233           if (ARM_AM::getSOImmVal(-CVal) != -1)
15234             break;
15235         }
15236         return;
15237
15238       case 'M':
15239         if (Subtarget->isThumb1Only()) {
15240           // This must be a multiple of 4 between 0 and 1020, for
15241           // ADD sp + immediate.
15242           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
15243             break;
15244         } else {
15245           // A power of two or a constant between 0 and 32.  This is used in
15246           // GCC for the shift amount on shifted register operands, but it is
15247           // useful in general for any shift amounts.
15248           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
15249             break;
15250         }
15251         return;
15252
15253       case 'N':
15254         if (Subtarget->isThumb()) {  // FIXME thumb2
15255           // This must be a constant between 0 and 31, for shift amounts.
15256           if (CVal >= 0 && CVal <= 31)
15257             break;
15258         }
15259         return;
15260
15261       case 'O':
15262         if (Subtarget->isThumb()) {  // FIXME thumb2
15263           // This must be a multiple of 4 between -508 and 508, for
15264           // ADD/SUB sp = sp + immediate.
15265           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
15266             break;
15267         }
15268         return;
15269     }
15270     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
15271     break;
15272   }
15273
15274   if (Result.getNode()) {
15275     Ops.push_back(Result);
15276     return;
15277   }
15278   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15279 }
15280
15281 static RTLIB::Libcall getDivRemLibcall(
15282     const SDNode *N, MVT::SimpleValueType SVT) {
15283   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
15284           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
15285          "Unhandled Opcode in getDivRemLibcall");
15286   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
15287                   N->getOpcode() == ISD::SREM;
15288   RTLIB::Libcall LC;
15289   switch (SVT) {
15290   default: llvm_unreachable("Unexpected request for libcall!");
15291   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
15292   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
15293   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
15294   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
15295   }
15296   return LC;
15297 }
15298
15299 static TargetLowering::ArgListTy getDivRemArgList(
15300     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
15301   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
15302           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
15303          "Unhandled Opcode in getDivRemArgList");
15304   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
15305                   N->getOpcode() == ISD::SREM;
15306   TargetLowering::ArgListTy Args;
15307   TargetLowering::ArgListEntry Entry;
15308   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
15309     EVT ArgVT = N->getOperand(i).getValueType();
15310     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
15311     Entry.Node = N->getOperand(i);
15312     Entry.Ty = ArgTy;
15313     Entry.IsSExt = isSigned;
15314     Entry.IsZExt = !isSigned;
15315     Args.push_back(Entry);
15316   }
15317   if (Subtarget->isTargetWindows() && Args.size() >= 2)
15318     std::swap(Args[0], Args[1]);
15319   return Args;
15320 }
15321
15322 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
15323   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
15324           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
15325           Subtarget->isTargetWindows()) &&
15326          "Register-based DivRem lowering only");
15327   unsigned Opcode = Op->getOpcode();
15328   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
15329          "Invalid opcode for Div/Rem lowering");
15330   bool isSigned = (Opcode == ISD::SDIVREM);
15331   EVT VT = Op->getValueType(0);
15332   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
15333   SDLoc dl(Op);
15334
15335   // If the target has hardware divide, use divide + multiply + subtract:
15336   //     div = a / b
15337   //     rem = a - b * div
15338   //     return {div, rem}
15339   // This should be lowered into UDIV/SDIV + MLS later on.
15340   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
15341                                         : Subtarget->hasDivideInARMMode();
15342   if (hasDivide && Op->getValueType(0).isSimple() &&
15343       Op->getSimpleValueType(0) == MVT::i32) {
15344     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
15345     const SDValue Dividend = Op->getOperand(0);
15346     const SDValue Divisor = Op->getOperand(1);
15347     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
15348     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
15349     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
15350
15351     SDValue Values[2] = {Div, Rem};
15352     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
15353   }
15354
15355   RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
15356                                        VT.getSimpleVT().SimpleTy);
15357   SDValue InChain = DAG.getEntryNode();
15358
15359   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
15360                                                     DAG.getContext(),
15361                                                     Subtarget);
15362
15363   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
15364                                          getPointerTy(DAG.getDataLayout()));
15365
15366   Type *RetTy = StructType::get(Ty, Ty);
15367
15368   if (Subtarget->isTargetWindows())
15369     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
15370
15371   TargetLowering::CallLoweringInfo CLI(DAG);
15372   CLI.setDebugLoc(dl).setChain(InChain)
15373     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
15374     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
15375
15376   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
15377   return CallInfo.first;
15378 }
15379
15380 // Lowers REM using divmod helpers
15381 // see RTABI section 4.2/4.3
15382 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
15383   // Build return types (div and rem)
15384   std::vector<Type*> RetTyParams;
15385   Type *RetTyElement;
15386
15387   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
15388   default: llvm_unreachable("Unexpected request for libcall!");
15389   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
15390   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
15391   case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
15392   case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
15393   }
15394
15395   RetTyParams.push_back(RetTyElement);
15396   RetTyParams.push_back(RetTyElement);
15397   ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
15398   Type *RetTy = StructType::get(*DAG.getContext(), ret);
15399
15400   RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
15401                                                              SimpleTy);
15402   SDValue InChain = DAG.getEntryNode();
15403   TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
15404                                                     Subtarget);
15405   bool isSigned = N->getOpcode() == ISD::SREM;
15406   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
15407                                          getPointerTy(DAG.getDataLayout()));
15408
15409   if (Subtarget->isTargetWindows())
15410     InChain = WinDBZCheckDenominator(DAG, N, InChain);
15411
15412   // Lower call
15413   CallLoweringInfo CLI(DAG);
15414   CLI.setChain(InChain)
15415      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
15416      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
15417   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
15418
15419   // Return second (rem) result operand (first contains div)
15420   SDNode *ResNode = CallResult.first.getNode();
15421   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
15422   return ResNode->getOperand(1);
15423 }
15424
15425 SDValue
15426 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
15427   assert(Subtarget->isTargetWindows() && "unsupported target platform");
15428   SDLoc DL(Op);
15429
15430   // Get the inputs.
15431   SDValue Chain = Op.getOperand(0);
15432   SDValue Size  = Op.getOperand(1);
15433
15434   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
15435           "no-stack-arg-probe")) {
15436     unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
15437     SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
15438     Chain = SP.getValue(1);
15439     SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
15440     if (Align)
15441       SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
15442                        DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
15443     Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
15444     SDValue Ops[2] = { SP, Chain };
15445     return DAG.getMergeValues(Ops, DL);
15446   }
15447
15448   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
15449                               DAG.getConstant(2, DL, MVT::i32));
15450
15451   SDValue Flag;
15452   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
15453   Flag = Chain.getValue(1);
15454
15455   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15456   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
15457
15458   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
15459   Chain = NewSP.getValue(1);
15460
15461   SDValue Ops[2] = { NewSP, Chain };
15462   return DAG.getMergeValues(Ops, DL);
15463 }
15464
15465 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
15466   SDValue SrcVal = Op.getOperand(0);
15467   const unsigned DstSz = Op.getValueType().getSizeInBits();
15468   const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
15469   assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
15470          "Unexpected type for custom-lowering FP_EXTEND");
15471
15472   assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
15473          "With both FP DP and 16, any FP conversion is legal!");
15474
15475   assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
15476          "With FP16, 16 to 32 conversion is legal!");
15477
15478   // Either we are converting from 16 -> 64, without FP16 and/or
15479   // FP.double-precision or without Armv8-fp. So we must do it in two
15480   // steps.
15481   // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
15482   // without FP16. So we must do a function call.
15483   SDLoc Loc(Op);
15484   RTLIB::Libcall LC;
15485   if (SrcSz == 16) {
15486     // Instruction from 16 -> 32
15487     if (Subtarget->hasFP16())
15488       SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
15489     // Lib call from 16 -> 32
15490     else {
15491       LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
15492       assert(LC != RTLIB::UNKNOWN_LIBCALL &&
15493              "Unexpected type for custom-lowering FP_EXTEND");
15494       SrcVal =
15495         makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first;
15496     }
15497   }
15498
15499   if (DstSz != 64)
15500     return SrcVal;
15501   // For sure now SrcVal is 32 bits
15502   if (Subtarget->hasFP64()) // Instruction from 32 -> 64
15503     return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);
15504
15505   LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
15506   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
15507          "Unexpected type for custom-lowering FP_EXTEND");
15508   return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first;
15509 }
15510
15511 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
15512   SDValue SrcVal = Op.getOperand(0);
15513   EVT SrcVT = SrcVal.getValueType();
15514   EVT DstVT = Op.getValueType();
15515   const unsigned DstSz = Op.getValueType().getSizeInBits();
15516   const unsigned SrcSz = SrcVT.getSizeInBits();
15517   (void)DstSz;
15518   assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
15519          "Unexpected type for custom-lowering FP_ROUND");
15520
15521   assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
15522          "With both FP DP and 16, any FP conversion is legal!");
15523
15524   SDLoc Loc(Op);
15525
15526   // Instruction from 32 -> 16 if hasFP16 is valid
15527   if (SrcSz == 32 && Subtarget->hasFP16())
15528     return Op;
15529
15530   // Lib call from 32 -> 16 / 64 -> [32, 16]
15531   RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
15532   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
15533          "Unexpected type for custom-lowering FP_ROUND");
15534   return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first;
15535 }
15536
15537 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
15538                                  SelectionDAG &DAG) const {
15539   assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
15540   MVT HalfT = MVT::i32;
15541   SDLoc dl(N);
15542   SDValue Hi, Lo, Tmp;
15543
15544   if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
15545       !isOperationLegalOrCustom(ISD::UADDO, HalfT))
15546     return ;
15547
15548   unsigned OpTypeBits = HalfT.getScalarSizeInBits();
15549   SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
15550
15551   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
15552                    DAG.getConstant(0, dl, HalfT));
15553   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
15554                    DAG.getConstant(1, dl, HalfT));
15555
15556   Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
15557                     DAG.getConstant(OpTypeBits - 1, dl,
15558                     getShiftAmountTy(HalfT, DAG.getDataLayout())));
15559   Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
15560   Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
15561                    SDValue(Lo.getNode(), 1));
15562   Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
15563   Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
15564
15565   Results.push_back(Lo);
15566   Results.push_back(Hi);
15567 }
15568
15569 bool
15570 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
15571   // The ARM target isn't yet aware of offsets.
15572   return false;
15573 }
15574
15575 bool ARM::isBitFieldInvertedMask(unsigned v) {
15576   if (v == 0xffffffff)
15577     return false;
15578
15579   // there can be 1's on either or both "outsides", all the "inside"
15580   // bits must be 0's
15581   return isShiftedMask_32(~v);
15582 }
15583
15584 /// isFPImmLegal - Returns true if the target can instruction select the
15585 /// specified FP immediate natively. If false, the legalizer will
15586 /// materialize the FP immediate as a load from a constant pool.
15587 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
15588                                      bool ForCodeSize) const {
15589   if (!Subtarget->hasVFP3Base())
15590     return false;
15591   if (VT == MVT::f16 && Subtarget->hasFullFP16())
15592     return ARM_AM::getFP16Imm(Imm) != -1;
15593   if (VT == MVT::f32)
15594     return ARM_AM::getFP32Imm(Imm) != -1;
15595   if (VT == MVT::f64 && Subtarget->hasFP64())
15596     return ARM_AM::getFP64Imm(Imm) != -1;
15597   return false;
15598 }
15599
15600 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
15601 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
15602 /// specified in the intrinsic calls.
15603 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
15604                                            const CallInst &I,
15605                                            MachineFunction &MF,
15606                                            unsigned Intrinsic) const {
15607   switch (Intrinsic) {
15608   case Intrinsic::arm_neon_vld1:
15609   case Intrinsic::arm_neon_vld2:
15610   case Intrinsic::arm_neon_vld3:
15611   case Intrinsic::arm_neon_vld4:
15612   case Intrinsic::arm_neon_vld2lane:
15613   case Intrinsic::arm_neon_vld3lane:
15614   case Intrinsic::arm_neon_vld4lane:
15615   case Intrinsic::arm_neon_vld2dup:
15616   case Intrinsic::arm_neon_vld3dup:
15617   case Intrinsic::arm_neon_vld4dup: {
15618     Info.opc = ISD::INTRINSIC_W_CHAIN;
15619     // Conservatively set memVT to the entire set of vectors loaded.
15620     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
15621     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
15622     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15623     Info.ptrVal = I.getArgOperand(0);
15624     Info.offset = 0;
15625     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
15626     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
15627     // volatile loads with NEON intrinsics not supported
15628     Info.flags = MachineMemOperand::MOLoad;
15629     return true;
15630   }
15631   case Intrinsic::arm_neon_vld1x2:
15632   case Intrinsic::arm_neon_vld1x3:
15633   case Intrinsic::arm_neon_vld1x4: {
15634     Info.opc = ISD::INTRINSIC_W_CHAIN;
15635     // Conservatively set memVT to the entire set of vectors loaded.
15636     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
15637     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
15638     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15639     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
15640     Info.offset = 0;
15641     Info.align = 0;
15642     // volatile loads with NEON intrinsics not supported
15643     Info.flags = MachineMemOperand::MOLoad;
15644     return true;
15645   }
15646   case Intrinsic::arm_neon_vst1:
15647   case Intrinsic::arm_neon_vst2:
15648   case Intrinsic::arm_neon_vst3:
15649   case Intrinsic::arm_neon_vst4:
15650   case Intrinsic::arm_neon_vst2lane:
15651   case Intrinsic::arm_neon_vst3lane:
15652   case Intrinsic::arm_neon_vst4lane: {
15653     Info.opc = ISD::INTRINSIC_VOID;
15654     // Conservatively set memVT to the entire set of vectors stored.
15655     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
15656     unsigned NumElts = 0;
15657     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
15658       Type *ArgTy = I.getArgOperand(ArgI)->getType();
15659       if (!ArgTy->isVectorTy())
15660         break;
15661       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
15662     }
15663     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15664     Info.ptrVal = I.getArgOperand(0);
15665     Info.offset = 0;
15666     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
15667     Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
15668     // volatile stores with NEON intrinsics not supported
15669     Info.flags = MachineMemOperand::MOStore;
15670     return true;
15671   }
15672   case Intrinsic::arm_neon_vst1x2:
15673   case Intrinsic::arm_neon_vst1x3:
15674   case Intrinsic::arm_neon_vst1x4: {
15675     Info.opc = ISD::INTRINSIC_VOID;
15676     // Conservatively set memVT to the entire set of vectors stored.
15677     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
15678     unsigned NumElts = 0;
15679     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
15680       Type *ArgTy = I.getArgOperand(ArgI)->getType();
15681       if (!ArgTy->isVectorTy())
15682         break;
15683       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
15684     }
15685     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15686     Info.ptrVal = I.getArgOperand(0);
15687     Info.offset = 0;
15688     Info.align = 0;
15689     // volatile stores with NEON intrinsics not supported
15690     Info.flags = MachineMemOperand::MOStore;
15691     return true;
15692   }
15693   case Intrinsic::arm_ldaex:
15694   case Intrinsic::arm_ldrex: {
15695     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
15696     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
15697     Info.opc = ISD::INTRINSIC_W_CHAIN;
15698     Info.memVT = MVT::getVT(PtrTy->getElementType());
15699     Info.ptrVal = I.getArgOperand(0);
15700     Info.offset = 0;
15701     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
15702     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15703     return true;
15704   }
15705   case Intrinsic::arm_stlex:
15706   case Intrinsic::arm_strex: {
15707     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
15708     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
15709     Info.opc = ISD::INTRINSIC_W_CHAIN;
15710     Info.memVT = MVT::getVT(PtrTy->getElementType());
15711     Info.ptrVal = I.getArgOperand(1);
15712     Info.offset = 0;
15713     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
15714     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15715     return true;
15716   }
15717   case Intrinsic::arm_stlexd:
15718   case Intrinsic::arm_strexd:
15719     Info.opc = ISD::INTRINSIC_W_CHAIN;
15720     Info.memVT = MVT::i64;
15721     Info.ptrVal = I.getArgOperand(2);
15722     Info.offset = 0;
15723     Info.align = 8;
15724     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15725     return true;
15726
15727   case Intrinsic::arm_ldaexd:
15728   case Intrinsic::arm_ldrexd:
15729     Info.opc = ISD::INTRINSIC_W_CHAIN;
15730     Info.memVT = MVT::i64;
15731     Info.ptrVal = I.getArgOperand(0);
15732     Info.offset = 0;
15733     Info.align = 8;
15734     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15735     return true;
15736
15737   default:
15738     break;
15739   }
15740
15741   return false;
15742 }
15743
15744 /// Returns true if it is beneficial to convert a load of a constant
15745 /// to just the constant itself.
15746 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
15747                                                           Type *Ty) const {
15748   assert(Ty->isIntegerTy());
15749
15750   unsigned Bits = Ty->getPrimitiveSizeInBits();
15751   if (Bits == 0 || Bits > 32)
15752     return false;
15753   return true;
15754 }
15755
15756 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
15757                                                 unsigned Index) const {
15758   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
15759     return false;
15760
15761   return (Index == 0 || Index == ResVT.getVectorNumElements());
15762 }
15763
15764 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
15765                                         ARM_MB::MemBOpt Domain) const {
15766   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
15767
15768   // First, if the target has no DMB, see what fallback we can use.
15769   if (!Subtarget->hasDataBarrier()) {
15770     // Some ARMv6 cpus can support data barriers with an mcr instruction.
15771     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
15772     // here.
15773     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
15774       Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
15775       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
15776                         Builder.getInt32(0), Builder.getInt32(7),
15777                         Builder.getInt32(10), Builder.getInt32(5)};
15778       return Builder.CreateCall(MCR, args);
15779     } else {
15780       // Instead of using barriers, atomic accesses on these subtargets use
15781       // libcalls.
15782       llvm_unreachable("makeDMB on a target so old that it has no barriers");
15783     }
15784   } else {
15785     Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
15786     // Only a full system barrier exists in the M-class architectures.
15787     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
15788     Constant *CDomain = Builder.getInt32(Domain);
15789     return Builder.CreateCall(DMB, CDomain);
15790   }
15791 }
15792
15793 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
15794 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
15795                                                  Instruction *Inst,
15796                                                  AtomicOrdering Ord) const {
15797   switch (Ord) {
15798   case AtomicOrdering::NotAtomic:
15799   case AtomicOrdering::Unordered:
15800     llvm_unreachable("Invalid fence: unordered/non-atomic");
15801   case AtomicOrdering::Monotonic:
15802   case AtomicOrdering::Acquire:
15803     return nullptr; // Nothing to do
15804   case AtomicOrdering::SequentiallyConsistent:
15805     if (!Inst->hasAtomicStore())
15806       return nullptr; // Nothing to do
15807     LLVM_FALLTHROUGH;
15808   case AtomicOrdering::Release:
15809   case AtomicOrdering::AcquireRelease:
15810     if (Subtarget->preferISHSTBarriers())
15811       return makeDMB(Builder, ARM_MB::ISHST);
15812     // FIXME: add a comment with a link to documentation justifying this.
15813     else
15814       return makeDMB(Builder, ARM_MB::ISH);
15815   }
15816   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
15817 }
15818
15819 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
15820                                                   Instruction *Inst,
15821                                                   AtomicOrdering Ord) const {
15822   switch (Ord) {
15823   case AtomicOrdering::NotAtomic:
15824   case AtomicOrdering::Unordered:
15825     llvm_unreachable("Invalid fence: unordered/not-atomic");
15826   case AtomicOrdering::Monotonic:
15827   case AtomicOrdering::Release:
15828     return nullptr; // Nothing to do
15829   case AtomicOrdering::Acquire:
15830   case AtomicOrdering::AcquireRelease:
15831   case AtomicOrdering::SequentiallyConsistent:
15832     return makeDMB(Builder, ARM_MB::ISH);
15833   }
15834   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
15835 }
15836
15837 // Loads and stores less than 64-bits are already atomic; ones above that
15838 // are doomed anyway, so defer to the default libcall and blame the OS when
15839 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
15840 // anything for those.
15841 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
15842   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
15843   return (Size == 64) && !Subtarget->isMClass();
15844 }
15845
15846 // Loads and stores less than 64-bits are already atomic; ones above that
15847 // are doomed anyway, so defer to the default libcall and blame the OS when
15848 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
15849 // anything for those.
15850 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
15851 // guarantee, see DDI0406C ARM architecture reference manual,
15852 // sections A8.8.72-74 LDRD)
15853 TargetLowering::AtomicExpansionKind
15854 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
15855   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
15856   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
15857                                                   : AtomicExpansionKind::None;
15858 }
15859
15860 // For the real atomic operations, we have ldrex/strex up to 32 bits,
15861 // and up to 64 bits on the non-M profiles
15862 TargetLowering::AtomicExpansionKind
15863 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
15864   if (AI->isFloatingPointOperation())
15865     return AtomicExpansionKind::CmpXChg;
15866
15867   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
15868   bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
15869   return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
15870              ? AtomicExpansionKind::LLSC
15871              : AtomicExpansionKind::None;
15872 }
15873
15874 TargetLowering::AtomicExpansionKind
15875 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
15876   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
15877   // implement cmpxchg without spilling. If the address being exchanged is also
15878   // on the stack and close enough to the spill slot, this can lead to a
15879   // situation where the monitor always gets cleared and the atomic operation
15880   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
15881   bool HasAtomicCmpXchg =
15882       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
15883   if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
15884     return AtomicExpansionKind::LLSC;
15885   return AtomicExpansionKind::None;
15886 }
15887
15888 bool ARMTargetLowering::shouldInsertFencesForAtomic(
15889     const Instruction *I) const {
15890   return InsertFencesForAtomic;
15891 }
15892
15893 // This has so far only been implemented for MachO.
15894 bool ARMTargetLowering::useLoadStackGuardNode() const {
15895   return Subtarget->isTargetMachO();
15896 }
15897
15898 void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
15899   if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
15900     return TargetLowering::insertSSPDeclarations(M);
15901
15902   // MSVC CRT has a global variable holding security cookie.
15903   M.getOrInsertGlobal("__security_cookie",
15904                       Type::getInt8PtrTy(M.getContext()));
15905
15906   // MSVC CRT has a function to validate security cookie.
15907   FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
15908       "__security_check_cookie", Type::getVoidTy(M.getContext()),
15909       Type::getInt8PtrTy(M.getContext()));
15910   if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
15911     F->addAttribute(1, Attribute::AttrKind::InReg);
15912 }
15913
15914 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
15915   // MSVC CRT has a global variable holding security cookie.
15916   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
15917     return M.getGlobalVariable("__security_cookie");
15918   return TargetLowering::getSDagStackGuard(M);
15919 }
15920
15921 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
15922   // MSVC CRT has a function to validate security cookie.
15923   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
15924     return M.getFunction("__security_check_cookie");
15925   return TargetLowering::getSSPStackGuardCheck(M);
15926 }
15927
15928 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
15929                                                   unsigned &Cost) const {
15930   // If we do not have NEON, vector types are not natively supported.
15931   if (!Subtarget->hasNEON())
15932     return false;
15933
15934   // Floating point values and vector values map to the same register file.
15935   // Therefore, although we could do a store extract of a vector type, this is
15936   // better to leave at float as we have more freedom in the addressing mode for
15937   // those.
15938   if (VectorTy->isFPOrFPVectorTy())
15939     return false;
15940
15941   // If the index is unknown at compile time, this is very expensive to lower
15942   // and it is not possible to combine the store with the extract.
15943   if (!isa<ConstantInt>(Idx))
15944     return false;
15945
15946   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
15947   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
15948   // We can do a store + vector extract on any vector that fits perfectly in a D
15949   // or Q register.
15950   if (BitWidth == 64 || BitWidth == 128) {
15951     Cost = 0;
15952     return true;
15953   }
15954   return false;
15955 }
15956
15957 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
15958   return Subtarget->hasV6T2Ops();
15959 }
15960
15961 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
15962   return Subtarget->hasV6T2Ops();
15963 }
15964
15965 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
15966   return !Subtarget->hasMinSize();
15967 }
15968
15969 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
15970                                          AtomicOrdering Ord) const {
15971   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
15972   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
15973   bool IsAcquire = isAcquireOrStronger(Ord);
15974
15975   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
15976   // intrinsic must return {i32, i32} and we have to recombine them into a
15977   // single i64 here.
15978   if (ValTy->getPrimitiveSizeInBits() == 64) {
15979     Intrinsic::ID Int =
15980         IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
15981     Function *Ldrex = Intrinsic::getDeclaration(M, Int);
15982
15983     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
15984     Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
15985
15986     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
15987     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
15988     if (!Subtarget->isLittle())
15989       std::swap (Lo, Hi);
15990     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
15991     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
15992     return Builder.CreateOr(
15993         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
15994   }
15995
15996   Type *Tys[] = { Addr->getType() };
15997   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
15998   Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
15999
16000   return Builder.CreateTruncOrBitCast(
16001       Builder.CreateCall(Ldrex, Addr),
16002       cast<PointerType>(Addr->getType())->getElementType());
16003 }
16004
16005 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
16006     IRBuilder<> &Builder) const {
16007   if (!Subtarget->hasV7Ops())
16008     return;
16009   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16010   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
16011 }
16012
16013 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
16014                                                Value *Addr,
16015                                                AtomicOrdering Ord) const {
16016   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16017   bool IsRelease = isReleaseOrStronger(Ord);
16018
16019   // Since the intrinsics must have legal type, the i64 intrinsics take two
16020   // parameters: "i32, i32". We must marshal Val into the appropriate form
16021   // before the call.
16022   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
16023     Intrinsic::ID Int =
16024         IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
16025     Function *Strex = Intrinsic::getDeclaration(M, Int);
16026     Type *Int32Ty = Type::getInt32Ty(M->getContext());
16027
16028     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
16029     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
16030     if (!Subtarget->isLittle())
16031       std::swap(Lo, Hi);
16032     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
16033     return Builder.CreateCall(Strex, {Lo, Hi, Addr});
16034   }
16035
16036   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
16037   Type *Tys[] = { Addr->getType() };
16038   Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
16039
16040   return Builder.CreateCall(
16041       Strex, {Builder.CreateZExtOrBitCast(
16042                   Val, Strex->getFunctionType()->getParamType(0)),
16043               Addr});
16044 }
16045
16046
16047 bool ARMTargetLowering::alignLoopsWithOptSize() const {
16048   return Subtarget->isMClass();
16049 }
16050
16051 /// A helper function for determining the number of interleaved accesses we
16052 /// will generate when lowering accesses of the given type.
16053 unsigned
16054 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
16055                                              const DataLayout &DL) const {
16056   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
16057 }
16058
16059 bool ARMTargetLowering::isLegalInterleavedAccessType(
16060     VectorType *VecTy, const DataLayout &DL) const {
16061
16062   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16063   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16064
16065   // Ensure the vector doesn't have f16 elements. Even though we could do an
16066   // i16 vldN, we can't hold the f16 vectors and will end up converting via
16067   // f32.
16068   if (VecTy->getElementType()->isHalfTy())
16069     return false;
16070
16071   // Ensure the number of vector elements is greater than 1.
16072   if (VecTy->getNumElements() < 2)
16073     return false;
16074
16075   // Ensure the element type is legal.
16076   if (ElSize != 8 && ElSize != 16 && ElSize != 32)
16077     return false;
16078
16079   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16080   // 128 will be split into multiple interleaved accesses.
16081   return VecSize == 64 || VecSize % 128 == 0;
16082 }
16083
16084 /// Lower an interleaved load into a vldN intrinsic.
16085 ///
16086 /// E.g. Lower an interleaved load (Factor = 2):
16087 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
16088 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
16089 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
16090 ///
16091 ///      Into:
16092 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
16093 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
16094 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
16095 bool ARMTargetLowering::lowerInterleavedLoad(
16096     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16097     ArrayRef<unsigned> Indices, unsigned Factor) const {
16098   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16099          "Invalid interleave factor");
16100   assert(!Shuffles.empty() && "Empty shufflevector input");
16101   assert(Shuffles.size() == Indices.size() &&
16102          "Unmatched number of shufflevectors and indices");
16103
16104   VectorType *VecTy = Shuffles[0]->getType();
16105   Type *EltTy = VecTy->getVectorElementType();
16106
16107   const DataLayout &DL = LI->getModule()->getDataLayout();
16108
16109   // Skip if we do not have NEON and skip illegal vector types. We can
16110   // "legalize" wide vector types into multiple interleaved accesses as long as
16111   // the vector types are divisible by 128.
16112   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
16113     return false;
16114
16115   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
16116
16117   // A pointer vector can not be the return type of the ldN intrinsics. Need to
16118   // load integer vectors first and then convert to pointer vectors.
16119   if (EltTy->isPointerTy())
16120     VecTy =
16121         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
16122
16123   IRBuilder<> Builder(LI);
16124
16125   // The base address of the load.
16126   Value *BaseAddr = LI->getPointerOperand();
16127
16128   if (NumLoads > 1) {
16129     // If we're going to generate more than one load, reset the sub-vector type
16130     // to something legal.
16131     VecTy = VectorType::get(VecTy->getVectorElementType(),
16132                             VecTy->getVectorNumElements() / NumLoads);
16133
16134     // We will compute the pointer operand of each load from the original base
16135     // address using GEPs. Cast the base address to a pointer to the scalar
16136     // element type.
16137     BaseAddr = Builder.CreateBitCast(
16138         BaseAddr, VecTy->getVectorElementType()->getPointerTo(
16139                       LI->getPointerAddressSpace()));
16140   }
16141
16142   assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
16143
16144   Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
16145   Type *Tys[] = {VecTy, Int8Ptr};
16146   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
16147                                             Intrinsic::arm_neon_vld3,
16148                                             Intrinsic::arm_neon_vld4};
16149   Function *VldnFunc =
16150       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
16151
16152   // Holds sub-vectors extracted from the load intrinsic return values. The
16153   // sub-vectors are associated with the shufflevector instructions they will
16154   // replace.
16155   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16156
16157   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16158     // If we're generating more than one load, compute the base address of
16159     // subsequent loads as an offset from the previous.
16160     if (LoadCount > 0)
16161       BaseAddr =
16162           Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
16163                                      VecTy->getVectorNumElements() * Factor);
16164
16165     SmallVector<Value *, 2> Ops;
16166     Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
16167     Ops.push_back(Builder.getInt32(LI->getAlignment()));
16168
16169     CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
16170
16171     // Replace uses of each shufflevector with the corresponding vector loaded
16172     // by ldN.
16173     for (unsigned i = 0; i < Shuffles.size(); i++) {
16174       ShuffleVectorInst *SV = Shuffles[i];
16175       unsigned Index = Indices[i];
16176
16177       Value *SubVec = Builder.CreateExtractValue(VldN, Index);
16178
16179       // Convert the integer vector to pointer vector if the element is pointer.
16180       if (EltTy->isPointerTy())
16181         SubVec = Builder.CreateIntToPtr(
16182             SubVec, VectorType::get(SV->getType()->getVectorElementType(),
16183                                     VecTy->getVectorNumElements()));
16184
16185       SubVecs[SV].push_back(SubVec);
16186     }
16187   }
16188
16189   // Replace uses of the shufflevector instructions with the sub-vectors
16190   // returned by the load intrinsic. If a shufflevector instruction is
16191   // associated with more than one sub-vector, those sub-vectors will be
16192   // concatenated into a single wide vector.
16193   for (ShuffleVectorInst *SVI : Shuffles) {
16194     auto &SubVec = SubVecs[SVI];
16195     auto *WideVec =
16196         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16197     SVI->replaceAllUsesWith(WideVec);
16198   }
16199
16200   return true;
16201 }
16202
16203 /// Lower an interleaved store into a vstN intrinsic.
16204 ///
16205 /// E.g. Lower an interleaved store (Factor = 3):
16206 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16207 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16208 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
16209 ///
16210 ///      Into:
16211 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16212 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16213 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16214 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
16215 ///
16216 /// Note that the new shufflevectors will be removed and we'll only generate one
16217 /// vst3 instruction in CodeGen.
16218 ///
16219 /// Example for a more general valid mask (Factor 3). Lower:
16220 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16221 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16222 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
16223 ///
16224 ///      Into:
16225 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16226 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16227 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16228 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
16229 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
16230                                               ShuffleVectorInst *SVI,
16231                                               unsigned Factor) const {
16232   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16233          "Invalid interleave factor");
16234
16235   VectorType *VecTy = SVI->getType();
16236   assert(VecTy->getVectorNumElements() % Factor == 0 &&
16237          "Invalid interleaved store");
16238
16239   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
16240   Type *EltTy = VecTy->getVectorElementType();
16241   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
16242
16243   const DataLayout &DL = SI->getModule()->getDataLayout();
16244
16245   // Skip if we do not have NEON and skip illegal vector types. We can
16246   // "legalize" wide vector types into multiple interleaved accesses as long as
16247   // the vector types are divisible by 128.
16248   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
16249     return false;
16250
16251   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
16252
16253   Value *Op0 = SVI->getOperand(0);
16254   Value *Op1 = SVI->getOperand(1);
16255   IRBuilder<> Builder(SI);
16256
16257   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16258   // vectors to integer vectors.
16259   if (EltTy->isPointerTy()) {
16260     Type *IntTy = DL.getIntPtrType(EltTy);
16261
16262     // Convert to the corresponding integer vector.
16263     Type *IntVecTy =
16264         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
16265     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16266     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16267
16268     SubVecTy = VectorType::get(IntTy, LaneLen);
16269   }
16270
16271   // The base address of the store.
16272   Value *BaseAddr = SI->getPointerOperand();
16273
16274   if (NumStores > 1) {
16275     // If we're going to generate more than one store, reset the lane length
16276     // and sub-vector type to something legal.
16277     LaneLen /= NumStores;
16278     SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
16279
16280     // We will compute the pointer operand of each store from the original base
16281     // address using GEPs. Cast the base address to a pointer to the scalar
16282     // element type.
16283     BaseAddr = Builder.CreateBitCast(
16284         BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
16285                       SI->getPointerAddressSpace()));
16286   }
16287
16288   assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
16289
16290   auto Mask = SVI->getShuffleMask();
16291
16292   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
16293   Type *Tys[] = {Int8Ptr, SubVecTy};
16294   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
16295                                              Intrinsic::arm_neon_vst3,
16296                                              Intrinsic::arm_neon_vst4};
16297
16298   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16299     // If we generating more than one store, we compute the base address of
16300     // subsequent stores as an offset from the previous.
16301     if (StoreCount > 0)
16302       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
16303                                             BaseAddr, LaneLen * Factor);
16304
16305     SmallVector<Value *, 6> Ops;
16306     Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
16307
16308     Function *VstNFunc =
16309         Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
16310
16311     // Split the shufflevector operands into sub vectors for the new vstN call.
16312     for (unsigned i = 0; i < Factor; i++) {
16313       unsigned IdxI = StoreCount * LaneLen * Factor + i;
16314       if (Mask[IdxI] >= 0) {
16315         Ops.push_back(Builder.CreateShuffleVector(
16316             Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
16317       } else {
16318         unsigned StartMask = 0;
16319         for (unsigned j = 1; j < LaneLen; j++) {
16320           unsigned IdxJ = StoreCount * LaneLen * Factor + j;
16321           if (Mask[IdxJ * Factor + IdxI] >= 0) {
16322             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
16323             break;
16324           }
16325         }
16326         // Note: If all elements in a chunk are undefs, StartMask=0!
16327         // Note: Filling undef gaps with random elements is ok, since
16328         // those elements were being written anyway (with undefs).
16329         // In the case of all undefs we're defaulting to using elems from 0
16330         // Note: StartMask cannot be negative, it's checked in
16331         // isReInterleaveMask
16332         Ops.push_back(Builder.CreateShuffleVector(
16333             Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
16334       }
16335     }
16336
16337     Ops.push_back(Builder.getInt32(SI->getAlignment()));
16338     Builder.CreateCall(VstNFunc, Ops);
16339   }
16340   return true;
16341 }
16342
// Base element kind of a candidate homogeneous aggregate, as tracked by
// isHomogeneousAggregate.
enum HABaseType {
  HA_UNKNOWN = 0, // No base element seen yet.
  HA_FLOAT = 1,   // float members.
  HA_DOUBLE = 2,  // double members.
  HA_VECT64 = 3,  // 64-bit vector members.
  HA_VECT128 = 4  // 128-bit vector members.
};
16350
16351 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
16352                                    uint64_t &Members) {
16353   if (auto *ST = dyn_cast<StructType>(Ty)) {
16354     for (unsigned i = 0; i < ST->getNumElements(); ++i) {
16355       uint64_t SubMembers = 0;
16356       if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
16357         return false;
16358       Members += SubMembers;
16359     }
16360   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
16361     uint64_t SubMembers = 0;
16362     if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
16363       return false;
16364     Members += SubMembers * AT->getNumElements();
16365   } else if (Ty->isFloatTy()) {
16366     if (Base != HA_UNKNOWN && Base != HA_FLOAT)
16367       return false;
16368     Members = 1;
16369     Base = HA_FLOAT;
16370   } else if (Ty->isDoubleTy()) {
16371     if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
16372       return false;
16373     Members = 1;
16374     Base = HA_DOUBLE;
16375   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
16376     Members = 1;
16377     switch (Base) {
16378     case HA_FLOAT:
16379     case HA_DOUBLE:
16380       return false;
16381     case HA_VECT64:
16382       return VT->getBitWidth() == 64;
16383     case HA_VECT128:
16384       return VT->getBitWidth() == 128;
16385     case HA_UNKNOWN:
16386       switch (VT->getBitWidth()) {
16387       case 64:
16388         Base = HA_VECT64;
16389         return true;
16390       case 128:
16391         Base = HA_VECT128;
16392         return true;
16393       default:
16394         return false;
16395       }
16396     }
16397   }
16398
16399   return (Members > 0 && Members <= 4);
16400 }
16401
16402 /// Return the correct alignment for the current calling convention.
16403 unsigned
16404 ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
16405                                                  DataLayout DL) const {
16406   if (!ArgTy->isVectorTy())
16407     return DL.getABITypeAlignment(ArgTy);
16408
16409   // Avoid over-aligning vector parameters. It would require realigning the
16410   // stack and waste space for no real benefit.
16411   return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
16412 }
16413
16414 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
16415 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
16416 /// passing according to AAPCS rules.
16417 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
16418     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
16419   if (getEffectiveCallingConv(CallConv, isVarArg) !=
16420       CallingConv::ARM_AAPCS_VFP)
16421     return false;
16422
16423   HABaseType Base = HA_UNKNOWN;
16424   uint64_t Members = 0;
16425   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
16426   LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
16427
16428   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
16429   return IsHA || IsIntArray;
16430 }
16431
16432 unsigned ARMTargetLowering::getExceptionPointerRegister(
16433     const Constant *PersonalityFn) const {
16434   // Platforms which do not use SjLj EH may return values in these registers
16435   // via the personality function.
16436   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
16437 }
16438
16439 unsigned ARMTargetLowering::getExceptionSelectorRegister(
16440     const Constant *PersonalityFn) const {
16441   // Platforms which do not use SjLj EH may return values in these registers
16442   // via the personality function.
16443   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
16444 }
16445
16446 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
16447   // Update IsSplitCSR in ARMFunctionInfo.
16448   ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
16449   AFI->setIsSplitCSR(true);
16450 }
16451
16452 void ARMTargetLowering::insertCopiesSplitCSR(
16453     MachineBasicBlock *Entry,
16454     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
16455   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
16456   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
16457   if (!IStart)
16458     return;
16459
16460   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
16461   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
16462   MachineBasicBlock::iterator MBBI = Entry->begin();
16463   for (const MCPhysReg *I = IStart; *I; ++I) {
16464     const TargetRegisterClass *RC = nullptr;
16465     if (ARM::GPRRegClass.contains(*I))
16466       RC = &ARM::GPRRegClass;
16467     else if (ARM::DPRRegClass.contains(*I))
16468       RC = &ARM::DPRRegClass;
16469     else
16470       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
16471
16472     unsigned NewVR = MRI->createVirtualRegister(RC);
16473     // Create copy from CSR to a virtual register.
16474     // FIXME: this currently does not emit CFI pseudo-instructions, it works
16475     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
16476     // nounwind. If we want to generalize this later, we may need to emit
16477     // CFI pseudo-instructions.
16478     assert(Entry->getParent()->getFunction().hasFnAttribute(
16479                Attribute::NoUnwind) &&
16480            "Function should be nounwind in insertCopiesSplitCSR!");
16481     Entry->addLiveIn(*I);
16482     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
16483         .addReg(*I);
16484
16485     // Insert the copy-back instructions right before the terminator.
16486     for (auto *Exit : Exits)
16487       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
16488               TII->get(TargetOpcode::COPY), *I)
16489           .addReg(NewVR);
16490   }
16491 }
16492
16493 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
16494   MF.getFrameInfo().computeMaxCallFrameSize(MF);
16495   TargetLoweringBase::finalizeLowering(MF);
16496 }