lib/Target/ARM/ARMISelLowering.cpp

   1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the interfaces that ARM uses to lower LLVM code into a
  10 // selection DAG.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "ARMISelLowering.h"
  15 #include "ARMBaseInstrInfo.h"
  16 #include "ARMBaseRegisterInfo.h"
  17 #include "ARMCallingConv.h"
  18 #include "ARMConstantPoolValue.h"
  19 #include "ARMMachineFunctionInfo.h"
  20 #include "ARMPerfectShuffle.h"
  21 #include "ARMRegisterInfo.h"
  22 #include "ARMSelectionDAGInfo.h"
  23 #include "ARMSubtarget.h"
  24 #include "MCTargetDesc/ARMAddressingModes.h"
  25 #include "MCTargetDesc/ARMBaseInfo.h"
  26 #include "Utils/ARMBaseInfo.h"
  27 #include "llvm/ADT/APFloat.h"
  28 #include "llvm/ADT/APInt.h"
  29 #include "llvm/ADT/ArrayRef.h"
  30 #include "llvm/ADT/BitVector.h"
  31 #include "llvm/ADT/DenseMap.h"
  32 #include "llvm/ADT/STLExtras.h"
  33 #include "llvm/ADT/SmallPtrSet.h"
  34 #include "llvm/ADT/SmallVector.h"
  35 #include "llvm/ADT/Statistic.h"
  36 #include "llvm/ADT/StringExtras.h"
  37 #include "llvm/ADT/StringRef.h"
  38 #include "llvm/ADT/StringSwitch.h"
  39 #include "llvm/ADT/Triple.h"
  40 #include "llvm/ADT/Twine.h"
  41 #include "llvm/Analysis/VectorUtils.h"
  42 #include "llvm/CodeGen/CallingConvLower.h"
  43 #include "llvm/CodeGen/ISDOpcodes.h"
  44 #include "llvm/CodeGen/IntrinsicLowering.h"
  45 #include "llvm/CodeGen/MachineBasicBlock.h"
  46 #include "llvm/CodeGen/MachineConstantPool.h"
  47 #include "llvm/CodeGen/MachineFrameInfo.h"
  48 #include "llvm/CodeGen/MachineFunction.h"
  49 #include "llvm/CodeGen/MachineInstr.h"
  50 #include "llvm/CodeGen/MachineInstrBuilder.h"
  51 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  52 #include "llvm/CodeGen/MachineMemOperand.h"
  53 #include "llvm/CodeGen/MachineOperand.h"
  54 #include "llvm/CodeGen/MachineRegisterInfo.h"
  55 #include "llvm/CodeGen/RuntimeLibcalls.h"
  56 #include "llvm/CodeGen/SelectionDAG.h"
  57 #include "llvm/CodeGen/SelectionDAGNodes.h"
  58 #include "llvm/CodeGen/TargetInstrInfo.h"
  59 #include "llvm/CodeGen/TargetLowering.h"
  60 #include "llvm/CodeGen/TargetOpcodes.h"
  61 #include "llvm/CodeGen/TargetRegisterInfo.h"
  62 #include "llvm/CodeGen/TargetSubtargetInfo.h"
  63 #include "llvm/CodeGen/ValueTypes.h"
  64 #include "llvm/IR/Attributes.h"
  65 #include "llvm/IR/CallingConv.h"
  66 #include "llvm/IR/Constant.h"
  67 #include "llvm/IR/Constants.h"
  68 #include "llvm/IR/DataLayout.h"
  69 #include "llvm/IR/DebugLoc.h"
  70 #include "llvm/IR/DerivedTypes.h"
  71 #include "llvm/IR/Function.h"
  72 #include "llvm/IR/GlobalAlias.h"
  73 #include "llvm/IR/GlobalValue.h"
  74 #include "llvm/IR/GlobalVariable.h"
  75 #include "llvm/IR/IRBuilder.h"
  76 #include "llvm/IR/InlineAsm.h"
  77 #include "llvm/IR/Instruction.h"
  78 #include "llvm/IR/Instructions.h"
  79 #include "llvm/IR/IntrinsicInst.h"
  80 #include "llvm/IR/Intrinsics.h"
  81 #include "llvm/IR/Module.h"
  82 #include "llvm/IR/PatternMatch.h"
  83 #include "llvm/IR/Type.h"
  84 #include "llvm/IR/User.h"
  85 #include "llvm/IR/Value.h"
  86 #include "llvm/MC/MCInstrDesc.h"
  87 #include "llvm/MC/MCInstrItineraries.h"
  88 #include "llvm/MC/MCRegisterInfo.h"
  89 #include "llvm/MC/MCSchedule.h"
  90 #include "llvm/Support/AtomicOrdering.h"
  91 #include "llvm/Support/BranchProbability.h"
  92 #include "llvm/Support/Casting.h"
  93 #include "llvm/Support/CodeGen.h"
  94 #include "llvm/Support/CommandLine.h"
  95 #include "llvm/Support/Compiler.h"
  96 #include "llvm/Support/Debug.h"
  97 #include "llvm/Support/ErrorHandling.h"
  98 #include "llvm/Support/KnownBits.h"
  99 #include "llvm/Support/MachineValueType.h"
 100 #include "llvm/Support/MathExtras.h"
 101 #include "llvm/Support/raw_ostream.h"
 102 #include "llvm/Target/TargetMachine.h"
 103 #include "llvm/Target/TargetOptions.h"
 104 #include <algorithm>
 105 #include <cassert>
 106 #include <cstdint>
 107 #include <cstdlib>
 108 #include <iterator>
 109 #include <limits>
 110 #include <string>
 111 #include <tuple>
 112 #include <utility>
 113 #include <vector>
 114
 115 using namespace llvm;
 116 using namespace llvm::PatternMatch;
 117
 118 #define DEBUG_TYPE "arm-isel"
 119
  // Counters declared through the STATISTIC macro. Each entry names a counter
  // variable and supplies the description string associated with it.
  120 STATISTIC(NumTailCalls, "Number of tail calls");
  121 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
  122 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
  123 STATISTIC(NumConstpoolPromoted,
  124   "Number of constants with their storage promoted into constant pools");
 125
  // Hidden boolean command-line option "arm-interworking"; initialised to true.
  // Its description marks it as a debugging aid.
  126 static cl::opt<bool>
  127 ARMInterworking("arm-interworking", cl::Hidden,
  128   cl::desc("Enable / disable ARM interworking (for debugging only)"),
  129   cl::init(true));
 130
  // Hidden options governing promotion of unnamed_addr constants into constant
  // pools. "arm-promote-constant" is the on/off switch and is initialised to
  // false (see the FIXME on its initialiser).
  131 static cl::opt<bool> EnableConstpoolPromotion(
  132     "arm-promote-constant", cl::Hidden,
  133     cl::desc("Enable / disable promotion of unnamed_addr constants into "
  134              "constant pools"),
  135     cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
  // "arm-promote-constant-max-size": largest single constant that may be
  // promoted; initialised to 64.
  136 static cl::opt<unsigned> ConstpoolPromotionMaxSize(
  137     "arm-promote-constant-max-size", cl::Hidden,
  138     cl::desc("Maximum size of constant to promote into a constant pool"),
  139     cl::init(64));
  // "arm-promote-constant-max-total": cap on the combined size of all promoted
  // constants; initialised to 128.
  140 static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
  141     "arm-promote-constant-max-total", cl::Hidden,
  142     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
  143     cl::init(128));
 144
  145 // The APCS parameter registers.
  // The table lists R0, R1, R2 and R3, in that order.
  146 static const MCPhysReg GPRArgRegs[] = {
  147   ARM::R0, ARM::R1, ARM::R2, ARM::R3
  148 };
 149
 150 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
 151                                        MVT PromotedBitwiseVT) {
 152   if (VT != PromotedLdStVT) {
 153     setOperationAction(ISD::LOAD, VT, Promote);
 154     AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
 155
 156     setOperationAction(ISD::STORE, VT, Promote);
 157     AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
 158   }
 159
 160   MVT ElemTy = VT.getVectorElementType();
 161   if (ElemTy != MVT::f64)
 162     setOperationAction(ISD::SETCC, VT, Custom);
 163   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 164   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 165   if (ElemTy == MVT::i32) {
 166     setOperationAction(ISD::SINT_TO_FP, VT, Custom);
 167     setOperationAction(ISD::UINT_TO_FP, VT, Custom);
 168     setOperationAction(ISD::FP_TO_SINT, VT, Custom);
 169     setOperationAction(ISD::FP_TO_UINT, VT, Custom);
 170   } else {
 171     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 172     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 173     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 174     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 175   }
 176   setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
 177   setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
 178   setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
 179   setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
 180   setOperationAction(ISD::SELECT,            VT, Expand);
 181   setOperationAction(ISD::SELECT_CC,         VT, Expand);
 182   setOperationAction(ISD::VSELECT,           VT, Expand);
 183   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 184   if (VT.isInteger()) {
 185     setOperationAction(ISD::SHL, VT, Custom);
 186     setOperationAction(ISD::SRA, VT, Custom);
 187     setOperationAction(ISD::SRL, VT, Custom);
 188   }
 189
 190   // Promote all bit-wise operations.
 191   if (VT.isInteger() && VT != PromotedBitwiseVT) {
 192     setOperationAction(ISD::AND, VT, Promote);
 193     AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
 194     setOperationAction(ISD::OR,  VT, Promote);
 195     AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
 196     setOperationAction(ISD::XOR, VT, Promote);
 197     AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
 198   }
 199
 200   // Neon does not support vector divide/remainder operations.
 201   setOperationAction(ISD::SDIV, VT, Expand);
 202   setOperationAction(ISD::UDIV, VT, Expand);
 203   setOperationAction(ISD::FDIV, VT, Expand);
 204   setOperationAction(ISD::SREM, VT, Expand);
 205   setOperationAction(ISD::UREM, VT, Expand);
 206   setOperationAction(ISD::FREM, VT, Expand);
 207
 208   if (!VT.isFloatingPoint() &&
 209       VT != MVT::v2i64 && VT != MVT::v1i64)
 210     for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
 211       setOperationAction(Opcode, VT, Legal);
 212 }
 213
 214 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
 215   addRegisterClass(VT, &ARM::DPRRegClass);
 216   addTypeForNEON(VT, MVT::f64, MVT::v2i32);
 217 }
 218
 219 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
 220   addRegisterClass(VT, &ARM::DPairRegClass);
 221   addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
 222 }
 223
 224 void ARMTargetLowering::setAllExpand(MVT VT) {
 225   for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
 226     setOperationAction(Opc, VT, Expand);
 227
 228   // We support these really simple operations even on types where all
 229   // the actual arithmetic has to be broken down into simpler
 230   // operations or turned into library calls.
 231   setOperationAction(ISD::BITCAST, VT, Legal);
 232   setOperationAction(ISD::LOAD, VT, Legal);
 233   setOperationAction(ISD::STORE, VT, Legal);
 234   setOperationAction(ISD::UNDEF, VT, Legal);
 235 }
 236
 237 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
 238                                        LegalizeAction Action) {
 239   setLoadExtAction(ISD::EXTLOAD,  From, To, Action);
 240   setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
 241   setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
 242 }
 243
 244 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
 245   const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
 246
 247   for (auto VT : IntTypes) {
 248     addRegisterClass(VT, &ARM::MQPRRegClass);
 249     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 250     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 251     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 252     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 253     setOperationAction(ISD::SHL, VT, Custom);
 254     setOperationAction(ISD::SRA, VT, Custom);
 255     setOperationAction(ISD::SRL, VT, Custom);
 256     setOperationAction(ISD::SMIN, VT, Legal);
 257     setOperationAction(ISD::SMAX, VT, Legal);
 258     setOperationAction(ISD::UMIN, VT, Legal);
 259     setOperationAction(ISD::UMAX, VT, Legal);
 260     setOperationAction(ISD::ABS, VT, Legal);
 261     setOperationAction(ISD::SETCC, VT, Custom);
 262     setOperationAction(ISD::MLOAD, VT, Custom);
 263     setOperationAction(ISD::MSTORE, VT, Legal);
 264     setOperationAction(ISD::CTLZ, VT, Legal);
 265     setOperationAction(ISD::CTTZ, VT, Custom);
 266     setOperationAction(ISD::BITREVERSE, VT, Legal);
 267     setOperationAction(ISD::BSWAP, VT, Legal);
 268     setOperationAction(ISD::SADDSAT, VT, Legal);
 269     setOperationAction(ISD::UADDSAT, VT, Legal);
 270
 271     // No native support for these.
 272     setOperationAction(ISD::UDIV, VT, Expand);
 273     setOperationAction(ISD::SDIV, VT, Expand);
 274     setOperationAction(ISD::UREM, VT, Expand);
 275     setOperationAction(ISD::SREM, VT, Expand);
 276     setOperationAction(ISD::CTPOP, VT, Expand);
 277
 278     // Vector reductions
 279     setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
 280     setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
 281     setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
 282     setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
 283     setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
 284
 285     if (!HasMVEFP) {
 286       setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 287       setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 288       setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 289       setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 290     }
 291
 292     // Pre and Post inc are supported on loads and stores
 293     for (unsigned im = (unsigned)ISD::PRE_INC;
 294          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 295       setIndexedLoadAction(im, VT, Legal);
 296       setIndexedStoreAction(im, VT, Legal);
 297     }
 298   }
 299
 300   const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
 301   for (auto VT : FloatTypes) {
 302     addRegisterClass(VT, &ARM::MQPRRegClass);
 303     if (!HasMVEFP)
 304       setAllExpand(VT);
 305
 306     // These are legal or custom whether we have MVE.fp or not
 307     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 308     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 309     setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom);
 310     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 311     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 312     setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
 313     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
 314     setOperationAction(ISD::SETCC, VT, Custom);
 315     setOperationAction(ISD::MLOAD, VT, Custom);
 316     setOperationAction(ISD::MSTORE, VT, Legal);
 317
 318     // Pre and Post inc are supported on loads and stores
 319     for (unsigned im = (unsigned)ISD::PRE_INC;
 320          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 321       setIndexedLoadAction(im, VT, Legal);
 322       setIndexedStoreAction(im, VT, Legal);
 323     }
 324
 325     if (HasMVEFP) {
 326       setOperationAction(ISD::FMINNUM, VT, Legal);
 327       setOperationAction(ISD::FMAXNUM, VT, Legal);
 328       setOperationAction(ISD::FROUND, VT, Legal);
 329
 330       // No native support for these.
 331       setOperationAction(ISD::FDIV, VT, Expand);
 332       setOperationAction(ISD::FREM, VT, Expand);
 333       setOperationAction(ISD::FSQRT, VT, Expand);
 334       setOperationAction(ISD::FSIN, VT, Expand);
 335       setOperationAction(ISD::FCOS, VT, Expand);
 336       setOperationAction(ISD::FPOW, VT, Expand);
 337       setOperationAction(ISD::FLOG, VT, Expand);
 338       setOperationAction(ISD::FLOG2, VT, Expand);
 339       setOperationAction(ISD::FLOG10, VT, Expand);
 340       setOperationAction(ISD::FEXP, VT, Expand);
 341       setOperationAction(ISD::FEXP2, VT, Expand);
 342       setOperationAction(ISD::FNEARBYINT, VT, Expand);
 343     }
 344   }
 345
 346   // We 'support' these types up to bitcast/load/store level, regardless of
 347   // MVE integer-only / float support. Only doing FP data processing on the FP
 348   // vector types is inhibited at integer-only level.
 349   const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
 350   for (auto VT : LongTypes) {
 351     addRegisterClass(VT, &ARM::MQPRRegClass);
 352     setAllExpand(VT);
 353     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 354     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 355     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 356   }
 357   // We can do bitwise operations on v2i64 vectors
 358   setOperationAction(ISD::AND, MVT::v2i64, Legal);
 359   setOperationAction(ISD::OR, MVT::v2i64, Legal);
 360   setOperationAction(ISD::XOR, MVT::v2i64, Legal);
 361
 362   // It is legal to extload from v4i8 to v4i16 or v4i32.
 363   addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
 364   addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
 365   addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
 366
 367   // Some truncating stores are legal too.
 368   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
 369   setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
 370   setTruncStoreAction(MVT::v8i16, MVT::v8i8,  Legal);
 371
 372   // Pre and Post inc on these are legal, given the correct extends
 373   for (unsigned im = (unsigned)ISD::PRE_INC;
 374        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
 375     setIndexedLoadAction(im, MVT::v8i8, Legal);
 376     setIndexedStoreAction(im, MVT::v8i8, Legal);
 377     setIndexedLoadAction(im, MVT::v4i8, Legal);
 378     setIndexedStoreAction(im, MVT::v4i8, Legal);
 379     setIndexedLoadAction(im, MVT::v4i16, Legal);
 380     setIndexedStoreAction(im, MVT::v4i16, Legal);
 381   }
 382
 383   // Predicate types
 384   const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1};
 385   for (auto VT : pTypes) {
 386     addRegisterClass(VT, &ARM::VCCRRegClass);
 387     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 388     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 389     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 390     setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
 391     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 392     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 393     setOperationAction(ISD::SETCC, VT, Custom);
 394     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
 395     setOperationAction(ISD::LOAD, VT, Custom);
 396     setOperationAction(ISD::STORE, VT, Custom);
 397   }
 398 }
 399
 400 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
 401                                      const ARMSubtarget &STI)
 402     : TargetLowering(TM), Subtarget(&STI) {
 403   RegInfo = Subtarget->getRegisterInfo();
 404   Itins = Subtarget->getInstrItineraryData();
 405
 406   setBooleanContents(ZeroOrOneBooleanContent);
 407   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 408
 409   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
 410       !Subtarget->isTargetWatchOS()) {
 411     bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
 412     for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
 413       setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
 414                             IsHFTarget ? CallingConv::ARM_AAPCS_VFP
 415                                        : CallingConv::ARM_AAPCS);
 416   }
 417
 418   if (Subtarget->isTargetMachO()) {
 419     // Uses VFP for Thumb libfuncs if available.
 420     if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
 421         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
 422       static const struct {
 423         const RTLIB::Libcall Op;
 424         const char * const Name;
 425         const ISD::CondCode Cond;
 426       } LibraryCalls[] = {
 427         // Single-precision floating-point arithmetic.
 428         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
 429         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
 430         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
 431         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
 432
 433         // Double-precision floating-point arithmetic.
 434         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
 435         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
 436         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
 437         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
 438
 439         // Single-precision comparisons.
 440         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
 441         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
 442         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
 443         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
 444         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
 445         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
 446         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
 447         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
 448
 449         // Double-precision comparisons.
 450         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
 451         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
 452         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
 453         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
 454         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
 455         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
 456         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
 457         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
 458
 459         // Floating-point to integer conversions.
 460         // i64 conversions are done via library routines even when generating VFP
 461         // instructions, so use the same ones.
 462         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
 463         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
 464         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
 465         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
 466
 467         // Conversions between floating types.
 468         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
 469         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
 470
 471         // Integer to floating-point conversions.
 472         // i64 conversions are done via library routines even when generating VFP
 473         // instructions, so use the same ones.
 474         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
 475         // e.g., __floatunsidf vs. __floatunssidfvfp.
 476         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
 477         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
 478         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
 479         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
 480       };
 481
 482       for (const auto &LC : LibraryCalls) {
 483         setLibcallName(LC.Op, LC.Name);
 484         if (LC.Cond != ISD::SETCC_INVALID)
 485           setCmpLibcallCC(LC.Op, LC.Cond);
 486       }
 487     }
 488   }
 489
 490   // These libcalls are not available in 32-bit.
 491   setLibcallName(RTLIB::SHL_I128, nullptr);
 492   setLibcallName(RTLIB::SRL_I128, nullptr);
 493   setLibcallName(RTLIB::SRA_I128, nullptr);
 494
 495   // RTLIB
 496   if (Subtarget->isAAPCS_ABI() &&
 497       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
 498        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
 499     static const struct {
 500       const RTLIB::Libcall Op;
 501       const char * const Name;
 502       const CallingConv::ID CC;
 503       const ISD::CondCode Cond;
 504     } LibraryCalls[] = {
 505       // Double-precision floating-point arithmetic helper functions
 506       // RTABI chapter 4.1.2, Table 2
 507       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 508       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 509       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 510       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 511
 512       // Double-precision floating-point comparison helper functions
 513       // RTABI chapter 4.1.2, Table 3
 514       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
 515       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
 516       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
 517       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
 518       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
 519       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
 520       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
 521       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 522
 523       // Single-precision floating-point arithmetic helper functions
 524       // RTABI chapter 4.1.2, Table 4
 525       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 526       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 527       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 528       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 529
 530       // Single-precision floating-point comparison helper functions
 531       // RTABI chapter 4.1.2, Table 5
 532       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
 533       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
 534       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
 535       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
 536       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
 537       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
 538       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
 539       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 540
 541       // Floating-point to integer conversions.
 542       // RTABI chapter 4.1.2, Table 6
 543       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 544       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 545       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 546       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 547       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 548       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 549       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 550       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 551
 552       // Conversions between floating types.
 553       // RTABI chapter 4.1.2, Table 7
 554       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 555       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 556       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 557
 558       // Integer to floating-point conversions.
 559       // RTABI chapter 4.1.2, Table 8
 560       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 561       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 562       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 563       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 564       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 565       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 566       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 567       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 568
 569       // Long long helper functions
 570       // RTABI chapter 4.2, Table 9
 571       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 572       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 573       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 574       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 575
 576       // Integer division functions
 577       // RTABI chapter 4.3.1
 578       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 579       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 580       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 581       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 582       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 583       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 584       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 585       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 586     };
 587
 588     for (const auto &LC : LibraryCalls) {
 589       setLibcallName(LC.Op, LC.Name);
 590       setLibcallCallingConv(LC.Op, LC.CC);
 591       if (LC.Cond != ISD::SETCC_INVALID)
 592         setCmpLibcallCC(LC.Op, LC.Cond);
 593     }
 594
 595     // EABI dependent RTLIB
 596     if (TM.Options.EABIVersion == EABI::EABI4 ||
 597         TM.Options.EABIVersion == EABI::EABI5) {
 598       static const struct {
 599         const RTLIB::Libcall Op;
 600         const char *const Name;
 601         const CallingConv::ID CC;
 602         const ISD::CondCode Cond;
 603       } MemOpsLibraryCalls[] = {
 604         // Memory operations
 605         // RTABI chapter 4.3.4
 606         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 607         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 608         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 609       };
 610
 611       for (const auto &LC : MemOpsLibraryCalls) {
 612         setLibcallName(LC.Op, LC.Name);
 613         setLibcallCallingConv(LC.Op, LC.CC);
 614         if (LC.Cond != ISD::SETCC_INVALID)
 615           setCmpLibcallCC(LC.Op, LC.Cond);
 616       }
 617     }
 618   }
 619
 620   if (Subtarget->isTargetWindows()) {
 621     static const struct {
 622       const RTLIB::Libcall Op;
 623       const char * const Name;
 624       const CallingConv::ID CC;
 625     } LibraryCalls[] = {
 626       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
 627       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
 628       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
 629       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
 630       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
 631       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
 632       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
 633       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
 634     };
 635
 636     for (const auto &LC : LibraryCalls) {
 637       setLibcallName(LC.Op, LC.Name);
 638       setLibcallCallingConv(LC.Op, LC.CC);
 639     }
 640   }
 641
 642   // Use divmod compiler-rt calls for iOS 5.0 and later.
 643   if (Subtarget->isTargetMachO() &&
 644       !(Subtarget->isTargetIOS() &&
 645         Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
 646     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
 647     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
 648   }
 649
 650   // The half <-> float conversion functions are always soft-float on
 651   // non-watchos platforms, but are needed for some targets which use a
 652   // hard-float calling convention by default.
 653   if (!Subtarget->isTargetWatchABI()) {
 654     if (Subtarget->isAAPCS_ABI()) {
 655       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
 656       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
 657       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
 658     } else {
 659       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
 660       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
 661       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
 662     }
 663   }
 664
 665   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
 666   // a __gnu_ prefix (which is the default).
 667   if (Subtarget->isTargetAEABI()) {
 668     static const struct {
 669       const RTLIB::Libcall Op;
 670       const char * const Name;
 671       const CallingConv::ID CC;
 672     } LibraryCalls[] = {
 673       { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
 674       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
 675       { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
 676     };
 677
 678     for (const auto &LC : LibraryCalls) {
 679       setLibcallName(LC.Op, LC.Name);
 680       setLibcallCallingConv(LC.Op, LC.CC);
 681     }
 682   }
 683
 684   if (Subtarget->isThumb1Only())
 685     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
 686   else
 687     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
 688
 689   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
 690       Subtarget->hasFPRegs()) {
 691     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
 692     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
 693     if (!Subtarget->hasVFP2Base())
 694       setAllExpand(MVT::f32);
 695     if (!Subtarget->hasFP64())
 696       setAllExpand(MVT::f64);
 697   }
 698
 699   if (Subtarget->hasFullFP16()) {
 700     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
 701     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
 702     setOperationAction(ISD::BITCAST, MVT::i32, Custom);
 703     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 704
 705     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
 706     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
 707   }
 708
 709   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
 710     for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
 711       setTruncStoreAction(VT, InnerVT, Expand);
 712       addAllExtLoads(VT, InnerVT, Expand);
 713     }
 714
 715     setOperationAction(ISD::MULHS, VT, Expand);
 716     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 717     setOperationAction(ISD::MULHU, VT, Expand);
 718     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 719
 720     setOperationAction(ISD::BSWAP, VT, Expand);
 721   }
 722
 723   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
 724   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 725
 726   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
 727   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
 728
 729   if (Subtarget->hasMVEIntegerOps())
 730     addMVEVectorTypes(Subtarget->hasMVEFloatOps());
 731
 732   // Combine low-overhead loop intrinsics so that we can lower i1 types.
 733   if (Subtarget->hasLOB()) {
 734     setTargetDAGCombine(ISD::BRCOND);
 735     setTargetDAGCombine(ISD::BR_CC);
 736   }
 737
 738   if (Subtarget->hasNEON()) {
 739     addDRTypeForNEON(MVT::v2f32);
 740     addDRTypeForNEON(MVT::v8i8);
 741     addDRTypeForNEON(MVT::v4i16);
 742     addDRTypeForNEON(MVT::v2i32);
 743     addDRTypeForNEON(MVT::v1i64);
 744
 745     addQRTypeForNEON(MVT::v4f32);
 746     addQRTypeForNEON(MVT::v2f64);
 747     addQRTypeForNEON(MVT::v16i8);
 748     addQRTypeForNEON(MVT::v8i16);
 749     addQRTypeForNEON(MVT::v4i32);
 750     addQRTypeForNEON(MVT::v2i64);
 751
 752     if (Subtarget->hasFullFP16()) {
 753       addQRTypeForNEON(MVT::v8f16);
 754       addDRTypeForNEON(MVT::v4f16);
 755     }
 756   }
 757
 758   if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
 759     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
 760     // none of Neon, MVE or VFP supports any arithmetic operations on it.
 761     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
 762     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
 763     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
 764     // FIXME: Code duplication: FDIV and FREM are expanded always, see
 765     // ARMTargetLowering::addTypeForNEON method for details.
 766     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
 767     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
 768     // FIXME: Create unittest.
 769     // In other words, find a case where "copysign" appears in the DAG with
 770     // vector operands.
 771     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
 772     // FIXME: Code duplication: SETCC has custom operation action, see
 773     // ARMTargetLowering::addTypeForNEON method for details.
 774     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
 775     // FIXME: Create unittest for FNEG and for FABS.
 776     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
 777     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
 778     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
 779     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
 780     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
 781     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
 782     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
 783     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
 784     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
 785     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
 786     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
 787     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
 788     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
 789     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
 790     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
 791     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
 792     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
 793     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
 794   }
 795
 796   if (Subtarget->hasNEON()) {
 797     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
 798     // supported for v4f32.
 799     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
 800     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
 801     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
 802     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
 803     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
 804     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
 805     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
 806     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
 807     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
 808     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
 809     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
 810     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
 811     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
 812     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
 813
 814     // Mark v2f32 intrinsics.
 815     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
 816     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
 817     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
 818     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
 819     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
 820     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
 821     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
 822     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
 823     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
 824     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
 825     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
 826     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
 827     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
 828     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
 829
 830     // Neon does not support some operations on v1i64 and v2i64 types.
 831     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 832     // Custom handling for some quad-vector types to detect VMULL.
 833     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
 834     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
 835     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 836     // Custom handling for some vector types to avoid expensive expansions
 837     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
 838     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
 839     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
 840     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
 841     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
 842     // a destination type that is wider than the source, and nor does
 843     // it have a FP_TO_[SU]INT instruction with a narrower destination than
 844     // source.
 845     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
 846     setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
 847     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
 848     setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
 849     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
 850     setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
 851     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
 852     setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
 853
 854     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
 855     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
 856
 857     // NEON does not have single instruction CTPOP for vectors with element
 858     // types wider than 8-bits.  However, custom lowering can leverage the
 859     // v8i8/v16i8 vcnt instruction.
 860     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
 861     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
 862     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
 863     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
 864     setOperationAction(ISD::CTPOP,      MVT::v1i64, Custom);
 865     setOperationAction(ISD::CTPOP,      MVT::v2i64, Custom);
 866
 867     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
 868     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
 869
 870     // NEON does not have single instruction CTTZ for vectors.
 871     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
 872     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
 873     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
 874     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
 875
 876     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
 877     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
 878     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
 879     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
 880
 881     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
 882     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
 883     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
 884     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
 885
 886     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
 887     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
 888     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
 889     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
 890
 891     // NEON only has FMA instructions as of VFP4.
 892     if (!Subtarget->hasVFP4Base()) {
 893       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
 894       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
 895     }
 896
 897     setTargetDAGCombine(ISD::INTRINSIC_VOID);
 898     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 899     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 900     setTargetDAGCombine(ISD::SHL);
 901     setTargetDAGCombine(ISD::SRL);
 902     setTargetDAGCombine(ISD::SRA);
 903     setTargetDAGCombine(ISD::FP_TO_SINT);
 904     setTargetDAGCombine(ISD::FP_TO_UINT);
 905     setTargetDAGCombine(ISD::FDIV);
 906     setTargetDAGCombine(ISD::LOAD);
 907
 908     // It is legal to extload from v4i8 to v4i16 or v4i32.
 909     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
 910                    MVT::v2i32}) {
 911       for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
 912         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
 913         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
 914         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
 915       }
 916     }
 917   }
 918
 919   if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
 920     setTargetDAGCombine(ISD::BUILD_VECTOR);
 921     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
 922     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 923     setTargetDAGCombine(ISD::STORE);
 924     setTargetDAGCombine(ISD::SIGN_EXTEND);
 925     setTargetDAGCombine(ISD::ZERO_EXTEND);
 926     setTargetDAGCombine(ISD::ANY_EXTEND);
 927   }
 928
 929   if (!Subtarget->hasFP64()) {
 930     // When targeting a floating-point unit with only single-precision
 931     // operations, f64 is legal for the few double-precision instructions which
 932     // are present. However, no double-precision operations other than moves,
 933     // loads and stores are provided by the hardware.
 934     setOperationAction(ISD::FADD,       MVT::f64, Expand);
 935     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
 936     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
 937     setOperationAction(ISD::FMA,        MVT::f64, Expand);
 938     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
 939     setOperationAction(ISD::FREM,       MVT::f64, Expand);
 940     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
 941     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
 942     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
 943     setOperationAction(ISD::FABS,       MVT::f64, Expand);
 944     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
 945     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
 946     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
 947     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
 948     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
 949     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
 950     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
 951     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
 952     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
 953     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
 954     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
 955     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
 956     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
 957     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
 958     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 959     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 960     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 961     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 962     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
 963     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
 964     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
 965   }
 966
 967   if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
 968     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
 969     if (Subtarget->hasFullFP16())
 970       setOperationAction(ISD::FP_ROUND,  MVT::f16, Custom);
 971   }
 972
 973   if (!Subtarget->hasFP16())
 974     setOperationAction(ISD::FP_EXTEND,  MVT::f32, Custom);
 975
 976   if (!Subtarget->hasFP64())
 977     setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
 978
 979   computeRegisterProperties(Subtarget->getRegisterInfo());
 980
 981   // ARM does not have floating-point extending loads.
 982   for (MVT VT : MVT::fp_valuetypes()) {
 983     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
 984     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
 985   }
 986
 987   // ... or truncating stores
 988   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 989   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 990   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 991
 992   // ARM does not have i1 sign extending load.
 993   for (MVT VT : MVT::integer_valuetypes())
 994     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 995
 996   // ARM supports all 4 flavors of integer indexed load / store.
 997   if (!Subtarget->isThumb1Only()) {
 998     for (unsigned im = (unsigned)ISD::PRE_INC;
 999          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1000       setIndexedLoadAction(im,  MVT::i1,  Legal);
1001       setIndexedLoadAction(im,  MVT::i8,  Legal);
1002       setIndexedLoadAction(im,  MVT::i16, Legal);
1003       setIndexedLoadAction(im,  MVT::i32, Legal);
1004       setIndexedStoreAction(im, MVT::i1,  Legal);
1005       setIndexedStoreAction(im, MVT::i8,  Legal);
1006       setIndexedStoreAction(im, MVT::i16, Legal);
1007       setIndexedStoreAction(im, MVT::i32, Legal);
1008     }
1009   } else {
1010     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1011     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
1012     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
1013   }
1014
1015   setOperationAction(ISD::SADDO, MVT::i32, Custom);
1016   setOperationAction(ISD::UADDO, MVT::i32, Custom);
1017   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
1018   setOperationAction(ISD::USUBO, MVT::i32, Custom);
1019
1020   setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
1021   setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
1022
1023   // i64 operation support.
1024   setOperationAction(ISD::MUL,     MVT::i64, Expand);
1025   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
1026   if (Subtarget->isThumb1Only()) {
1027     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
1028     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
1029   }
1030   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1031       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1032     setOperationAction(ISD::MULHS, MVT::i32, Expand);
1033
1034   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
1035   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
1036   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
1037   setOperationAction(ISD::SRL,       MVT::i64, Custom);
1038   setOperationAction(ISD::SRA,       MVT::i64, Custom);
1039   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1040   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1041
1042   // MVE lowers 64 bit shifts to lsll and lsrl
1043   // assuming that ISD::SRL and SRA of i64 are already marked custom
1044   if (Subtarget->hasMVEIntegerOps())
1045     setOperationAction(ISD::SHL, MVT::i64, Custom);
1046
1047   // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1048   if (Subtarget->isThumb1Only()) {
1049     setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
1050     setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
1051     setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
1052   }
1053
1054   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1055     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
1056
1057   // ARM does not have ROTL.
1058   setOperationAction(ISD::ROTL, MVT::i32, Expand);
1059   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1060     setOperationAction(ISD::ROTL, VT, Expand);
1061     setOperationAction(ISD::ROTR, VT, Expand);
1062   }
1063   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
1064   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
1065   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1066     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
1067     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall);
1068   }
1069
1070   // @llvm.readcyclecounter requires the Performance Monitors extension.
1071   // Default to the 0 expansion on unsupported platforms.
1072   // FIXME: Technically there are older ARM CPUs that have
1073   // implementation-specific ways of obtaining this information.
1074   if (Subtarget->hasPerfMon())
1075     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1076
1077   // Only ARMv6 and later have BSWAP.
1078   if (!Subtarget->hasV6Ops())
1079     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
1080
1081   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1082                                         : Subtarget->hasDivideInARMMode();
1083   if (!hasDivide) {
1084     // These are expanded into libcalls if the cpu doesn't have HW divider.
1085     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
1086     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
1087   }
1088
1089   if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1090     setOperationAction(ISD::SDIV, MVT::i32, Custom);
1091     setOperationAction(ISD::UDIV, MVT::i32, Custom);
1092
1093     setOperationAction(ISD::SDIV, MVT::i64, Custom);
1094     setOperationAction(ISD::UDIV, MVT::i64, Custom);
1095   }
1096
1097   setOperationAction(ISD::SREM,  MVT::i32, Expand);
1098   setOperationAction(ISD::UREM,  MVT::i32, Expand);
1099
1100   // Register based DivRem for AEABI (RTABI 4.2)
1101   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1102       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1103       Subtarget->isTargetWindows()) {
1104     setOperationAction(ISD::SREM, MVT::i64, Custom);
1105     setOperationAction(ISD::UREM, MVT::i64, Custom);
1106     HasStandaloneRem = false;
1107
1108     if (Subtarget->isTargetWindows()) {
1109       const struct {
1110         const RTLIB::Libcall Op;
1111         const char * const Name;
1112         const CallingConv::ID CC;
1113       } LibraryCalls[] = {
1114         { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1115         { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1116         { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1117         { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1118
1119         { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1120         { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1121         { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1122         { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1123       };
1124
1125       for (const auto &LC : LibraryCalls) {
1126         setLibcallName(LC.Op, LC.Name);
1127         setLibcallCallingConv(LC.Op, LC.CC);
1128       }
1129     } else {
1130       const struct {
1131         const RTLIB::Libcall Op;
1132         const char * const Name;
1133         const CallingConv::ID CC;
1134       } LibraryCalls[] = {
1135         { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1136         { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1137         { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1138         { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1139
1140         { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1141         { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1142         { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1143         { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1144       };
1145
1146       for (const auto &LC : LibraryCalls) {
1147         setLibcallName(LC.Op, LC.Name);
1148         setLibcallCallingConv(LC.Op, LC.CC);
1149       }
1150     }
1151
1152     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
1153     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
1154     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
1155     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
1156   } else {
1157     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
1158     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
1159   }
1160
1161   if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
1162     for (auto &VT : {MVT::f32, MVT::f64})
1163       setOperationAction(ISD::FPOWI, VT, Custom);
1164
1165   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
1166   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
1167   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
1168   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
1169
1170   setOperationAction(ISD::TRAP, MVT::Other, Legal);
1171   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1172
1173   // Use the default implementation.
1174   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
1175   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
1176   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
1177   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
1178   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
1179   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
1180
1181   if (Subtarget->isTargetWindows())
1182     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1183   else
1184     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1185
1186   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1187   // the default expansion.
1188   InsertFencesForAtomic = false;
1189   if (Subtarget->hasAnyDataBarrier() &&
1190       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1191     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1192     // to ldrex/strex loops already.
1193     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
1194     if (!Subtarget->isThumb() || !Subtarget->isMClass())
1195       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
1196
1197     // On v8, we have particularly efficient implementations of atomic fences
1198     // if they can be combined with nearby atomic loads and stores.
1199     if (!Subtarget->hasAcquireRelease() ||
1200         getTargetMachine().getOptLevel() == 0) {
1201       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1202       InsertFencesForAtomic = true;
1203     }
1204   } else {
1205     // If there's anything we can use as a barrier, go through custom lowering
1206     // for ATOMIC_FENCE.
1207     // If target has DMB in thumb, Fences can be inserted.
1208     if (Subtarget->hasDataBarrier())
1209       InsertFencesForAtomic = true;
1210
1211     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
1212                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1213
1214     // Set them all for expansion, which will force libcalls.
1215     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
1216     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
1217     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
1218     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
1219     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
1220     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
1221     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
1222     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
1223     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
1224     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
1225     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
1226     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
1227     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1228     // Unordered/Monotonic case.
1229     if (!InsertFencesForAtomic) {
1230       setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1231       setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1232     }
1233   }
1234
1235   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
1236
1237   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1238   if (!Subtarget->hasV6Ops()) {
1239     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
1240     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
1241   }
1242   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
1243
1244   if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1245       !Subtarget->isThumb1Only()) {
1246     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1247     // iff target supports vfp2.
1248     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1249     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
1250   }
1251
1252   // We want to custom lower some of our intrinsics.
1253   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1254   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
1255   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
1256   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
1257   if (Subtarget->useSjLjEH())
1258     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1259
1260   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
1261   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
1262   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
1263   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
1264   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
1265   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
1266   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
1267   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
1268   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
1269   if (Subtarget->hasFullFP16()) {
1270     setOperationAction(ISD::SETCC,     MVT::f16, Expand);
1271     setOperationAction(ISD::SELECT,    MVT::f16, Custom);
1272     setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
1273   }
1274
1275   setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
1276
1277   setOperationAction(ISD::BRCOND,    MVT::Other, Custom);
1278   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
1279   if (Subtarget->hasFullFP16())
1280       setOperationAction(ISD::BR_CC, MVT::f16,   Custom);
1281   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
1282   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
1283   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
1284
1285   // We don't support sin/cos/fmod/copysign/pow
1286   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
1287   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
1288   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
1289   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
1290   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
1291   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
1292   setOperationAction(ISD::FREM,      MVT::f64, Expand);
1293   setOperationAction(ISD::FREM,      MVT::f32, Expand);
1294   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1295       !Subtarget->isThumb1Only()) {
1296     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
1297     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
1298   }
1299   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
1300   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
1301
1302   if (!Subtarget->hasVFP4Base()) {
1303     setOperationAction(ISD::FMA, MVT::f64, Expand);
1304     setOperationAction(ISD::FMA, MVT::f32, Expand);
1305   }
1306
1307   // Various VFP goodness
1308   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1309     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1310     if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1311       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1312       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1313     }
1314
1315     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1316     if (!Subtarget->hasFP16()) {
1317       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1318       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1319     }
1320   }
1321
1322   // Use __sincos_stret if available.
1323   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1324       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1325     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1326     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1327   }
1328
1329   // FP-ARMv8 implements a lot of rounding-like FP operations.
1330   if (Subtarget->hasFPARMv8Base()) {
1331     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1332     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1333     setOperationAction(ISD::FROUND, MVT::f32, Legal);
1334     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1335     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1336     setOperationAction(ISD::FRINT, MVT::f32, Legal);
1337     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1338     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1339     if (Subtarget->hasNEON()) {
1340       setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1341       setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1342       setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1343       setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1344     }
1345
1346     if (Subtarget->hasFP64()) {
1347       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1348       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1349       setOperationAction(ISD::FROUND, MVT::f64, Legal);
1350       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1351       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1352       setOperationAction(ISD::FRINT, MVT::f64, Legal);
1353       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1354       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1355     }
1356   }
1357
1358   // FP16 often need to be promoted to call lib functions
1359   if (Subtarget->hasFullFP16()) {
1360     setOperationAction(ISD::FREM, MVT::f16, Promote);
1361     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
1362     setOperationAction(ISD::FSIN, MVT::f16, Promote);
1363     setOperationAction(ISD::FCOS, MVT::f16, Promote);
1364     setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
1365     setOperationAction(ISD::FPOWI, MVT::f16, Promote);
1366     setOperationAction(ISD::FPOW, MVT::f16, Promote);
1367     setOperationAction(ISD::FEXP, MVT::f16, Promote);
1368     setOperationAction(ISD::FEXP2, MVT::f16, Promote);
1369     setOperationAction(ISD::FLOG, MVT::f16, Promote);
1370     setOperationAction(ISD::FLOG10, MVT::f16, Promote);
1371     setOperationAction(ISD::FLOG2, MVT::f16, Promote);
1372
1373     setOperationAction(ISD::FROUND, MVT::f16, Legal);
1374   }
1375
1376   if (Subtarget->hasNEON()) {
1377     // vmin and vmax aren't available in a scalar form, so we use
1378     // a NEON instruction with an undef lane instead.
1379     setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1380     setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1381     setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1382     setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1383     setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1384     setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1385     setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1386     setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1387
1388     if (Subtarget->hasFullFP16()) {
1389       setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1390       setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1391       setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1392       setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1393
1394       setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1395       setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1396       setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1397       setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1398     }
1399   }
1400
1401   // We have target-specific dag combine patterns for the following nodes:
1402   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
1403   setTargetDAGCombine(ISD::ADD);
1404   setTargetDAGCombine(ISD::SUB);
1405   setTargetDAGCombine(ISD::MUL);
1406   setTargetDAGCombine(ISD::AND);
1407   setTargetDAGCombine(ISD::OR);
1408   setTargetDAGCombine(ISD::XOR);
1409
1410   if (Subtarget->hasV6Ops())
1411     setTargetDAGCombine(ISD::SRL);
1412   if (Subtarget->isThumb1Only())
1413     setTargetDAGCombine(ISD::SHL);
1414
1415   setStackPointerRegisterToSaveRestore(ARM::SP);
1416
1417   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1418       !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1419     setSchedulingPreference(Sched::RegPressure);
1420   else
1421     setSchedulingPreference(Sched::Hybrid);
1422
1423   //// temporary - rewrite interface to use type
1424   MaxStoresPerMemset = 8;
1425   MaxStoresPerMemsetOptSize = 4;
1426   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1427   MaxStoresPerMemcpyOptSize = 2;
1428   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1429   MaxStoresPerMemmoveOptSize = 2;
1430
1431   // On ARM arguments smaller than 4 bytes are extended, so all arguments
1432   // are at least 4 bytes aligned.
1433   setMinStackArgumentAlignment(Align(4));
1434
1435   // Prefer likely predicted branches to selects on out-of-order cores.
1436   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1437
1438   setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1439
1440   setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1441
1442   if (Subtarget->isThumb() || Subtarget->isThumb2())
1443     setTargetDAGCombine(ISD::ABS);
1444 }
1445
1446 bool ARMTargetLowering::useSoftFloat() const {
1447   return Subtarget->useSoftFloat();
1448 }
1449
1450 // FIXME: It might make sense to define the representative register class as the
1451 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1452 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
1453 // SPR's representative would be DPR_VFP2. This should work well if register
1454 // pressure tracking were modified such that a register use would increment the
1455 // pressure of the register class's representative and all of it's super
1456 // classes' representatives transitively. We have not implemented this because
1457 // of the difficulty prior to coalescing of modeling operand register classes
1458 // due to the common occurrence of cross class copies and subregister insertions
1459 // and extractions.
1460 std::pair<const TargetRegisterClass *, uint8_t>
1461 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1462                                            MVT VT) const {
1463   const TargetRegisterClass *RRC = nullptr;
1464   uint8_t Cost = 1;
1465   switch (VT.SimpleTy) {
1466   default:
1467     return TargetLowering::findRepresentativeClass(TRI, VT);
1468   // Use DPR as representative register class for all floating point
1469   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1470   // the cost is 1 for both f32 and f64.
1471   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1472   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1473     RRC = &ARM::DPRRegClass;
1474     // When NEON is used for SP, only half of the register file is available
1475     // because operations that define both SP and DP results will be constrained
1476     // to the VFP2 class (D0-D15). We currently model this constraint prior to
1477     // coalescing by double-counting the SP regs. See the FIXME above.
1478     if (Subtarget->useNEONForSinglePrecisionFP())
1479       Cost = 2;
1480     break;
1481   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1482   case MVT::v4f32: case MVT::v2f64:
1483     RRC = &ARM::DPRRegClass;
1484     Cost = 2;
1485     break;
1486   case MVT::v4i64:
1487     RRC = &ARM::DPRRegClass;
1488     Cost = 4;
1489     break;
1490   case MVT::v8i64:
1491     RRC = &ARM::DPRRegClass;
1492     Cost = 8;
1493     break;
1494   }
1495   return std::make_pair(RRC, Cost);
1496 }
1497
1498 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1499   switch ((ARMISD::NodeType)Opcode) {
1500   case ARMISD::FIRST_NUMBER:  break;
1501   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
1502   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
1503   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
1504   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
1505   case ARMISD::CALL:          return "ARMISD::CALL";
1506   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
1507   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
1508   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
1509   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
1510   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
1511   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
1512   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
1513   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
1514   case ARMISD::CMP:           return "ARMISD::CMP";
1515   case ARMISD::CMN:           return "ARMISD::CMN";
1516   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
1517   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
1518   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
1519   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
1520   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
1521
1522   case ARMISD::CMOV:          return "ARMISD::CMOV";
1523   case ARMISD::SUBS:          return "ARMISD::SUBS";
1524
1525   case ARMISD::SSAT:          return "ARMISD::SSAT";
1526   case ARMISD::USAT:          return "ARMISD::USAT";
1527
1528   case ARMISD::ASRL:          return "ARMISD::ASRL";
1529   case ARMISD::LSRL:          return "ARMISD::LSRL";
1530   case ARMISD::LSLL:          return "ARMISD::LSLL";
1531
1532   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
1533   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
1534   case ARMISD::RRX:           return "ARMISD::RRX";
1535
1536   case ARMISD::ADDC:          return "ARMISD::ADDC";
1537   case ARMISD::ADDE:          return "ARMISD::ADDE";
1538   case ARMISD::SUBC:          return "ARMISD::SUBC";
1539   case ARMISD::SUBE:          return "ARMISD::SUBE";
1540   case ARMISD::LSLS:          return "ARMISD::LSLS";
1541
1542   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
1543   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
1544   case ARMISD::VMOVhr:        return "ARMISD::VMOVhr";
1545   case ARMISD::VMOVrh:        return "ARMISD::VMOVrh";
1546   case ARMISD::VMOVSR:        return "ARMISD::VMOVSR";
1547
1548   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
1549   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
1550   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
1551
1552   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
1553
1554   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
1555
1556   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
1557
1558   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
1559
1560   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
1561
1562   case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
1563   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
1564
1565   case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
1566   case ARMISD::VCMP:          return "ARMISD::VCMP";
1567   case ARMISD::VCMPZ:         return "ARMISD::VCMPZ";
1568   case ARMISD::VTST:          return "ARMISD::VTST";
1569
1570   case ARMISD::VSHLs:         return "ARMISD::VSHLs";
1571   case ARMISD::VSHLu:         return "ARMISD::VSHLu";
1572   case ARMISD::VSHLIMM:       return "ARMISD::VSHLIMM";
1573   case ARMISD::VSHRsIMM:      return "ARMISD::VSHRsIMM";
1574   case ARMISD::VSHRuIMM:      return "ARMISD::VSHRuIMM";
1575   case ARMISD::VRSHRsIMM:     return "ARMISD::VRSHRsIMM";
1576   case ARMISD::VRSHRuIMM:     return "ARMISD::VRSHRuIMM";
1577   case ARMISD::VRSHRNIMM:     return "ARMISD::VRSHRNIMM";
1578   case ARMISD::VQSHLsIMM:     return "ARMISD::VQSHLsIMM";
1579   case ARMISD::VQSHLuIMM:     return "ARMISD::VQSHLuIMM";
1580   case ARMISD::VQSHLsuIMM:    return "ARMISD::VQSHLsuIMM";
1581   case ARMISD::VQSHRNsIMM:    return "ARMISD::VQSHRNsIMM";
1582   case ARMISD::VQSHRNuIMM:    return "ARMISD::VQSHRNuIMM";
1583   case ARMISD::VQSHRNsuIMM:   return "ARMISD::VQSHRNsuIMM";
1584   case ARMISD::VQRSHRNsIMM:   return "ARMISD::VQRSHRNsIMM";
1585   case ARMISD::VQRSHRNuIMM:   return "ARMISD::VQRSHRNuIMM";
1586   case ARMISD::VQRSHRNsuIMM:  return "ARMISD::VQRSHRNsuIMM";
1587   case ARMISD::VSLIIMM:       return "ARMISD::VSLIIMM";
1588   case ARMISD::VSRIIMM:       return "ARMISD::VSRIIMM";
1589   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
1590   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
1591   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
1592   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
1593   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
1594   case ARMISD::VDUP:          return "ARMISD::VDUP";
1595   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
1596   case ARMISD::VEXT:          return "ARMISD::VEXT";
1597   case ARMISD::VREV64:        return "ARMISD::VREV64";
1598   case ARMISD::VREV32:        return "ARMISD::VREV32";
1599   case ARMISD::VREV16:        return "ARMISD::VREV16";
1600   case ARMISD::VZIP:          return "ARMISD::VZIP";
1601   case ARMISD::VUZP:          return "ARMISD::VUZP";
1602   case ARMISD::VTRN:          return "ARMISD::VTRN";
1603   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
1604   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
1605   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
1606   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
1607   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
1608   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
1609   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
1610   case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
1611   case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
1612   case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
1613   case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
1614   case ARMISD::SMULWB:        return "ARMISD::SMULWB";
1615   case ARMISD::SMULWT:        return "ARMISD::SMULWT";
1616   case ARMISD::SMLALD:        return "ARMISD::SMLALD";
1617   case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
1618   case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
1619   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
1620   case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
1621   case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
1622   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
1623   case ARMISD::BFI:           return "ARMISD::BFI";
1624   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
1625   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
1626   case ARMISD::VBSL:          return "ARMISD::VBSL";
1627   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
1628   case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
1629   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
1630   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
1631   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
1632   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
1633   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
1634   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
1635   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
1636   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
1637   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
1638   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
1639   case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
1640   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
1641   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
1642   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
1643   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
1644   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
1645   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
1646   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
1647   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
1648   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
1649   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
1650   case ARMISD::WLS:           return "ARMISD::WLS";
1651   case ARMISD::LE:            return "ARMISD::LE";
1652   case ARMISD::LOOP_DEC:      return "ARMISD::LOOP_DEC";
1653   case ARMISD::CSINV:         return "ARMISD::CSINV";
1654   case ARMISD::CSNEG:         return "ARMISD::CSNEG";
1655   case ARMISD::CSINC:         return "ARMISD::CSINC";
1656   }
1657   return nullptr;
1658 }
1659
1660 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1661                                           EVT VT) const {
1662   if (!VT.isVector())
1663     return getPointerTy(DL);
1664
1665   // MVE has a predicate register.
1666   if (Subtarget->hasMVEIntegerOps() &&
1667       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8))
1668     return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1669   return VT.changeVectorElementTypeToInteger();
1670 }
1671
1672 /// getRegClassFor - Return the register class that should be used for the
1673 /// specified value type.
1674 const TargetRegisterClass *
1675 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1676   (void)isDivergent;
1677   // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1678   // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1679   // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1680   // MVE Q registers.
1681   if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1682     if (VT == MVT::v4i64)
1683       return &ARM::QQPRRegClass;
1684     if (VT == MVT::v8i64)
1685       return &ARM::QQQQPRRegClass;
1686   }
1687   return TargetLowering::getRegClassFor(VT);
1688 }
1689
1690 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1691 // source/dest is aligned and the copy size is large enough. We therefore want
1692 // to align such objects passed to memory intrinsics.
1693 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
1694                                                unsigned &PrefAlign) const {
1695   if (!isa<MemIntrinsic>(CI))
1696     return false;
1697   MinSize = 8;
1698   // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1699   // cycle faster than 4-byte aligned LDM.
1700   PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4);
1701   return true;
1702 }
1703
1704 // Create a fast isel object.
1705 FastISel *
1706 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1707                                   const TargetLibraryInfo *libInfo) const {
1708   return ARM::createFastISel(funcInfo, libInfo);
1709 }
1710
1711 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
1712   unsigned NumVals = N->getNumValues();
1713   if (!NumVals)
1714     return Sched::RegPressure;
1715
1716   for (unsigned i = 0; i != NumVals; ++i) {
1717     EVT VT = N->getValueType(i);
1718     if (VT == MVT::Glue || VT == MVT::Other)
1719       continue;
1720     if (VT.isFloatingPoint() || VT.isVector())
1721       return Sched::ILP;
1722   }
1723
1724   if (!N->isMachineOpcode())
1725     return Sched::RegPressure;
1726
1727   // Load are scheduled for latency even if there instruction itinerary
1728   // is not available.
1729   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1730   const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1731
1732   if (MCID.getNumDefs() == 0)
1733     return Sched::RegPressure;
1734   if (!Itins->isEmpty() &&
1735       Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1736     return Sched::ILP;
1737
1738   return Sched::RegPressure;
1739 }
1740
1741 //===----------------------------------------------------------------------===//
1742 // Lowering Code
1743 //===----------------------------------------------------------------------===//
1744
1745 static bool isSRL16(const SDValue &Op) {
1746   if (Op.getOpcode() != ISD::SRL)
1747     return false;
1748   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1749     return Const->getZExtValue() == 16;
1750   return false;
1751 }
1752
1753 static bool isSRA16(const SDValue &Op) {
1754   if (Op.getOpcode() != ISD::SRA)
1755     return false;
1756   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1757     return Const->getZExtValue() == 16;
1758   return false;
1759 }
1760
1761 static bool isSHL16(const SDValue &Op) {
1762   if (Op.getOpcode() != ISD::SHL)
1763     return false;
1764   if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1765     return Const->getZExtValue() == 16;
1766   return false;
1767 }
1768
1769 // Check for a signed 16-bit value. We special case SRA because it makes it
1770 // more simple when also looking for SRAs that aren't sign extending a
1771 // smaller value. Without the check, we'd need to take extra care with
1772 // checking order for some operations.
1773 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1774   if (isSRA16(Op))
1775     return isSHL16(Op.getOperand(0));
1776   return DAG.ComputeNumSignBits(Op) == 17;
1777 }
1778
1779 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1780 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
1781   switch (CC) {
1782   default: llvm_unreachable("Unknown condition code!");
1783   case ISD::SETNE:  return ARMCC::NE;
1784   case ISD::SETEQ:  return ARMCC::EQ;
1785   case ISD::SETGT:  return ARMCC::GT;
1786   case ISD::SETGE:  return ARMCC::GE;
1787   case ISD::SETLT:  return ARMCC::LT;
1788   case ISD::SETLE:  return ARMCC::LE;
1789   case ISD::SETUGT: return ARMCC::HI;
1790   case ISD::SETUGE: return ARMCC::HS;
1791   case ISD::SETULT: return ARMCC::LO;
1792   case ISD::SETULE: return ARMCC::LS;
1793   }
1794 }
1795
1796 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1797 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
1798                         ARMCC::CondCodes &CondCode2) {
1799   CondCode2 = ARMCC::AL;
1800   switch (CC) {
1801   default: llvm_unreachable("Unknown FP condition!");
1802   case ISD::SETEQ:
1803   case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1804   case ISD::SETGT:
1805   case ISD::SETOGT: CondCode = ARMCC::GT; break;
1806   case ISD::SETGE:
1807   case ISD::SETOGE: CondCode = ARMCC::GE; break;
1808   case ISD::SETOLT: CondCode = ARMCC::MI; break;
1809   case ISD::SETOLE: CondCode = ARMCC::LS; break;
1810   case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1811   case ISD::SETO:   CondCode = ARMCC::VC; break;
1812   case ISD::SETUO:  CondCode = ARMCC::VS; break;
1813   case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1814   case ISD::SETUGT: CondCode = ARMCC::HI; break;
1815   case ISD::SETUGE: CondCode = ARMCC::PL; break;
1816   case ISD::SETLT:
1817   case ISD::SETULT: CondCode = ARMCC::LT; break;
1818   case ISD::SETLE:
1819   case ISD::SETULE: CondCode = ARMCC::LE; break;
1820   case ISD::SETNE:
1821   case ISD::SETUNE: CondCode = ARMCC::NE; break;
1822   }
1823 }
1824
1825 //===----------------------------------------------------------------------===//
1826 //                      Calling Convention Implementation
1827 //===----------------------------------------------------------------------===//
1828
1829 /// getEffectiveCallingConv - Get the effective calling convention, taking into
1830 /// account presence of floating point hardware and calling convention
1831 /// limitations, such as support for variadic functions.
1832 CallingConv::ID
1833 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1834                                            bool isVarArg) const {
1835   switch (CC) {
1836   default:
1837     report_fatal_error("Unsupported calling convention");
1838   case CallingConv::ARM_AAPCS:
1839   case CallingConv::ARM_APCS:
1840   case CallingConv::GHC:
1841     return CC;
1842   case CallingConv::PreserveMost:
1843     return CallingConv::PreserveMost;
1844   case CallingConv::ARM_AAPCS_VFP:
1845   case CallingConv::Swift:
1846     return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
1847   case CallingConv::C:
1848     if (!Subtarget->isAAPCS_ABI())
1849       return CallingConv::ARM_APCS;
1850     else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1851              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1852              !isVarArg)
1853       return CallingConv::ARM_AAPCS_VFP;
1854     else
1855       return CallingConv::ARM_AAPCS;
1856   case CallingConv::Fast:
1857   case CallingConv::CXX_FAST_TLS:
1858     if (!Subtarget->isAAPCS_ABI()) {
1859       if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1860         return CallingConv::Fast;
1861       return CallingConv::ARM_APCS;
1862     } else if (Subtarget->hasVFP2Base() &&
1863                !Subtarget->isThumb1Only() && !isVarArg)
1864       return CallingConv::ARM_AAPCS_VFP;
1865     else
1866       return CallingConv::ARM_AAPCS;
1867   }
1868 }
1869
1870 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1871                                                  bool isVarArg) const {
1872   return CCAssignFnForNode(CC, false, isVarArg);
1873 }
1874
1875 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1876                                                    bool isVarArg) const {
1877   return CCAssignFnForNode(CC, true, isVarArg);
1878 }
1879
1880 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1881 /// CallingConvention.
1882 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1883                                                  bool Return,
1884                                                  bool isVarArg) const {
1885   switch (getEffectiveCallingConv(CC, isVarArg)) {
1886   default:
1887     report_fatal_error("Unsupported calling convention");
1888   case CallingConv::ARM_APCS:
1889     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1890   case CallingConv::ARM_AAPCS:
1891     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1892   case CallingConv::ARM_AAPCS_VFP:
1893     return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1894   case CallingConv::Fast:
1895     return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1896   case CallingConv::GHC:
1897     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1898   case CallingConv::PreserveMost:
1899     return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1900   }
1901 }
1902
1903 /// LowerCallResult - Lower the result values of a call into the
1904 /// appropriate copies out of appropriate physical registers.
1905 SDValue ARMTargetLowering::LowerCallResult(
1906     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
1907     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1908     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1909     SDValue ThisVal) const {
1910   // Assign locations to each value returned by this call.
1911   SmallVector<CCValAssign, 16> RVLocs;
1912   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1913                  *DAG.getContext());
1914   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1915
1916   // Copy all of the result registers out of their specified physreg.
1917   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1918     CCValAssign VA = RVLocs[i];
1919
1920     // Pass 'this' value directly from the argument to return value, to avoid
1921     // reg unit interference
1922     if (i == 0 && isThisReturn) {
1923       assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1924              "unexpected return calling convention register assignment");
1925       InVals.push_back(ThisVal);
1926       continue;
1927     }
1928
1929     SDValue Val;
1930     if (VA.needsCustom()) {
1931       // Handle f64 or half of a v2f64.
1932       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1933                                       InFlag);
1934       Chain = Lo.getValue(1);
1935       InFlag = Lo.getValue(2);
1936       VA = RVLocs[++i]; // skip ahead to next loc
1937       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1938                                       InFlag);
1939       Chain = Hi.getValue(1);
1940       InFlag = Hi.getValue(2);
1941       if (!Subtarget->isLittle())
1942         std::swap (Lo, Hi);
1943       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1944
1945       if (VA.getLocVT() == MVT::v2f64) {
1946         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1947         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1948                           DAG.getConstant(0, dl, MVT::i32));
1949
1950         VA = RVLocs[++i]; // skip ahead to next loc
1951         Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1952         Chain = Lo.getValue(1);
1953         InFlag = Lo.getValue(2);
1954         VA = RVLocs[++i]; // skip ahead to next loc
1955         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
1956         Chain = Hi.getValue(1);
1957         InFlag = Hi.getValue(2);
1958         if (!Subtarget->isLittle())
1959           std::swap (Lo, Hi);
1960         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1961         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1962                           DAG.getConstant(1, dl, MVT::i32));
1963       }
1964     } else {
1965       Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1966                                InFlag);
1967       Chain = Val.getValue(1);
1968       InFlag = Val.getValue(2);
1969     }
1970
1971     switch (VA.getLocInfo()) {
1972     default: llvm_unreachable("Unknown loc info!");
1973     case CCValAssign::Full: break;
1974     case CCValAssign::BCvt:
1975       Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1976       break;
1977     }
1978
1979     InVals.push_back(Val);
1980   }
1981
1982   return Chain;
1983 }
1984
1985 /// LowerMemOpCallTo - Store the argument to the stack.
1986 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
1987                                             SDValue Arg, const SDLoc &dl,
1988                                             SelectionDAG &DAG,
1989                                             const CCValAssign &VA,
1990                                             ISD::ArgFlagsTy Flags) const {
1991   unsigned LocMemOffset = VA.getLocMemOffset();
1992   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
1993   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1994                        StackPtr, PtrOff);
1995   return DAG.getStore(
1996       Chain, dl, Arg, PtrOff,
1997       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
1998 }
1999
2000 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2001                                          SDValue Chain, SDValue &Arg,
2002                                          RegsToPassVector &RegsToPass,
2003                                          CCValAssign &VA, CCValAssign &NextVA,
2004                                          SDValue &StackPtr,
2005                                          SmallVectorImpl<SDValue> &MemOpChains,
2006                                          ISD::ArgFlagsTy Flags) const {
2007   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2008                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
2009   unsigned id = Subtarget->isLittle() ? 0 : 1;
2010   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2011
2012   if (NextVA.isRegLoc())
2013     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2014   else {
2015     assert(NextVA.isMemLoc());
2016     if (!StackPtr.getNode())
2017       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2018                                     getPointerTy(DAG.getDataLayout()));
2019
2020     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
2021                                            dl, DAG, NextVA,
2022                                            Flags));
2023   }
2024 }
2025
2026 /// LowerCall - Lowering a call into a callseq_start <-
2027 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2028 /// nodes.
     ///
     /// Everything describing the call is read from \p CLI. CLI.IsTailCall is
     /// bound by reference and is cleared here when tail calls are unsupported,
     /// when the caller carries "disable-tail-calls"="true", or when the
     /// eligibility check fails.
     ///
     /// \returns for a tail call, the ARMISD::TC_RETURN node; otherwise the value
     /// produced by LowerCallResult, which receives \p InVals.
2029 SDValue
2030 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2031                              SmallVectorImpl<SDValue> &InVals) const {
2032   SelectionDAG &DAG                     = CLI.DAG;
2033   SDLoc &dl                             = CLI.DL;
2034   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2035   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2036   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2037   SDValue Chain                         = CLI.Chain;
2038   SDValue Callee                        = CLI.Callee;
2039   bool &isTailCall                      = CLI.IsTailCall;
2040   CallingConv::ID CallConv              = CLI.CallConv;
2041   bool doesNotRet                       = CLI.DoesNotReturn;
2042   bool isVarArg                         = CLI.IsVarArg;
2043
2044   MachineFunction &MF = DAG.getMachineFunction();
2045   MachineFunction::CallSiteInfo CSInfo;
       // sret is detected from the flags of the first outgoing argument only.
2046   bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2047   bool isThisReturn = false;
2048   auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
2049   bool PreferIndirect = false;
2050
2051   // Disable tail calls if they're not supported.
2052   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
2053     isTailCall = false;
2054
2055   if (isa<GlobalAddressSDNode>(Callee)) {
2056     // If we're optimizing for minimum size and the function is called three or
2057     // more times in this block, we can improve codesize by calling indirectly
2058     // as BLXr has a 16-bit encoding.
2059     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
         // The user scan needs the call's parent block, so it is only done when
         // a call site is attached to CLI.
2060     if (CLI.CS) {
2061       auto *BB = CLI.CS.getParent();
2062       PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2063                        count_if(GV->users(), [&BB](const User *U) {
2064                          return isa<Instruction>(U) &&
2065                                 cast<Instruction>(U)->getParent() == BB;
2066                        }) > 2;
2067     }
2068   }
2069   if (isTailCall) {
2070     // Check if it's really possible to do a tail call.
2071     isTailCall = IsEligibleForTailCallOptimization(
2072         Callee, CallConv, isVarArg, isStructRet,
2073         MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2074         PreferIndirect);
2075     if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
2076       report_fatal_error("failed to perform tail call elimination on a call "
2077                          "site marked musttail");
2078     // We don't support GuaranteedTailCallOpt for ARM, only automatically
2079     // detected sibcalls.
2080     if (isTailCall)
2081       ++NumTailCalls;
2082   }
2083
2084   // Analyze operands of the call, assigning locations to each operand.
2085   SmallVector<CCValAssign, 16> ArgLocs;
2086   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2087                  *DAG.getContext());
2088   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2089
2090   // Get a count of how many bytes are to be pushed on the stack.
2091   unsigned NumBytes = CCInfo.getNextStackOffset();
2092
2093   if (isTailCall) {
2094     // For tail calls, memory operands are available in our caller's stack.
2095     NumBytes = 0;
2096   } else {
2097     // Adjust the stack pointer for the new arguments...
2098     // These operations are automatically eliminated by the prolog/epilog pass
2099     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
2100   }
2101
       // SP is read on the updated Chain, i.e. after CALLSEQ_START when one was
       // emitted above.
2102   SDValue StackPtr =
2103       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2104
2105   RegsToPassVector RegsToPass;
2106   SmallVector<SDValue, 8> MemOpChains;
2107
2108   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2109   // of tail call optimization, arguments are handled later.
       // i walks ArgLocs while realArgIdx walks Outs/OutVals; custom-lowered
       // f64/v2f64 arguments advance i further inside the body because they
       // occupy several consecutive ArgLocs entries.
2110   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2111        i != e;
2112        ++i, ++realArgIdx) {
2113     CCValAssign &VA = ArgLocs[i];
2114     SDValue Arg = OutVals[realArgIdx];
2115     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2116     bool isByVal = Flags.isByVal();
2117
2118     // Promote the value if needed.
2119     switch (VA.getLocInfo()) {
2120     default: llvm_unreachable("Unknown loc info!");
2121     case CCValAssign::Full: break;
2122     case CCValAssign::SExt:
2123       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2124       break;
2125     case CCValAssign::ZExt:
2126       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2127       break;
2128     case CCValAssign::AExt:
2129       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2130       break;
2131     case CCValAssign::BCvt:
2132       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2133       break;
2134     }
2135
2136     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2137     if (VA.needsCustom()) {
2138       if (VA.getLocVT() == MVT::v2f64) {
             // Handle the two f64 lanes one after the other.
2139         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2140                                   DAG.getConstant(0, dl, MVT::i32));
2141         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2142                                   DAG.getConstant(1, dl, MVT::i32));
2143
2144         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
2145                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
2146
             // NOTE(review): VA is a reference to an ArgLocs element, so this
             // assignment copies the next entry over that element rather than
             // rebinding VA -- confirm nothing later reads the overwritten slot.
2147         VA = ArgLocs[++i]; // skip ahead to next loc
2148         if (VA.isRegLoc()) {
2149           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
2150                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
2151         } else {
2152           assert(VA.isMemLoc());
2153
2154           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
2155                                                  dl, DAG, VA, Flags));
2156         }
2157       } else {
2158         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2159                          StackPtr, MemOpChains, Flags);
2160       }
2161     } else if (VA.isRegLoc()) {
           // A first argument that is an i32 marked 'returned' (and not
           // swiftself) makes this a 'this'-return candidate; the flag is
           // revisited when the register mask is chosen below.
2162       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2163           Outs[0].VT == MVT::i32) {
2164         assert(VA.getLocVT() == MVT::i32 &&
2165                "unexpected calling convention register assignment");
2166         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2167                "unexpected use of 'returned'");
2168         isThisReturn = true;
2169       }
           // Record (register, ArgLocs index) pairs for the call-site info only
           // when debug entry values are enabled.
2170       const TargetOptions &Options = DAG.getTarget().Options;
2171       if (Options.EnableDebugEntryValues)
2172         CSInfo.emplace_back(VA.getLocReg(), i);
2173       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2174     } else if (isByVal) {
2175       assert(VA.isMemLoc());
           // Number of 4-byte registers that carry the head of this aggregate.
2176       unsigned offset = 0;
2177
2178       // True if this byval aggregate will be split between registers
2179       // and memory.
2180       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2181       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2182
2183       if (CurByValIdx < ByValArgsCount) {
2184
2185         unsigned RegBegin, RegEnd;
2186         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2187
2188         EVT PtrVT =
2189             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
             // NOTE: this i shadows the outer loop index until the end of this
             // block.
2190         unsigned int i, j;
             // Load one pointer-sized word per assigned register, 4 bytes apart,
             // starting at the byval source address.
2191         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2192           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2193           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2194           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
2195                                      MachinePointerInfo(),
2196                                      DAG.InferPtrAlignment(AddArg));
2197           MemOpChains.push_back(Load.getValue(1));
2198           RegsToPass.push_back(std::make_pair(j, Load));
2199         }
2200
2201         // If parameter size outsides register area, "offset" value
2202         // helps us to calculate stack slot for remained part properly.
2203         offset = RegEnd - RegBegin;
2204
2205         CCInfo.nextInRegsParam();
2206       }
2207
           // Copy whatever did not go into registers (4*offset bytes did) from
           // the source aggregate to the argument's stack slot.
2208       if (Flags.getByValSize() > 4*offset) {
2209         auto PtrVT = getPointerTy(DAG.getDataLayout());
2210         unsigned LocMemOffset = VA.getLocMemOffset();
2211         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2212         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
2213         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2214         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2215         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2216                                            MVT::i32);
2217         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
2218                                             MVT::i32);
2219
2220         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2221         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2222         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2223                                           Ops));
2224       }
2225     } else if (!isTailCall) {
           // Plain stack argument; nothing is stored here for tail calls.
2226       assert(VA.isMemLoc());
2227
2228       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2229                                              dl, DAG, VA, Flags));
2230     }
2231   }
2232
       // Merge the chains of all argument stores/loads into the main chain.
2233   if (!MemOpChains.empty())
2234     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2235
2236   // Build a sequence of copy-to-reg nodes chained together with token chain
2237   // and flag operands which copy the outgoing args into the appropriate regs.
2238   SDValue InFlag;
2239   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2240     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2241                              RegsToPass[i].second, InFlag);
2242     InFlag = Chain.getValue(1);
2243   }
2244
2245   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2246   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2247   // node so that legalize doesn't hack it.
2248   bool isDirect = false;
2249
2250   const TargetMachine &TM = getTargetMachine();
2251   const Module *Mod = MF.getFunction().getParent();
       // GV stays null when the callee is not a GlobalAddress node.
2252   const GlobalValue *GV = nullptr;
2253   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2254     GV = G->getGlobal();
       // isStub: the callee is not assumed DSO-local and the target is MachO.
2255   bool isStub =
2256       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
2257
2258   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2259   bool isLocalARMFunc = false;
2260   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2261   auto PtrVt = getPointerTy(DAG.getDataLayout());
2262
2263   if (Subtarget->genLongCalls()) {
2264     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2265            "long-calls codegen is not position independent!");
2266     // Handle a global address or an external symbol. If it's not one of
2267     // those, the target's already in a register, so we don't need to do
2268     // anything extra.
2269     if (isa<GlobalAddressSDNode>(Callee)) {
2270       // Create a constant pool entry for the callee address
2271       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2272       ARMConstantPoolValue *CPV =
2273         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
2274
2275       // Get the address of the callee into a register
2276       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2277       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2278       Callee = DAG.getLoad(
2279           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2280           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2281     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2282       const char *Sym = S->getSymbol();
2283
2284       // Create a constant pool entry for the callee address
2285       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2286       ARMConstantPoolValue *CPV =
2287         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2288                                       ARMPCLabelIndex, 0);
2289       // Get the address of the callee into a register
2290       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2291       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2292       Callee = DAG.getLoad(
2293           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2294           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2295     }
2296   } else if (isa<GlobalAddressSDNode>(Callee)) {
         // With PreferIndirect the callee is left as a plain GlobalAddress node
         // and isDirect stays false.
2297     if (!PreferIndirect) {
2298       isDirect = true;
2299       bool isDef = GV->isStrongDefinitionForLinker();
2300
2301       // ARM call to a local ARM function is predicable.
2302       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2303       // tBX takes a register source operand.
2304       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2305         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2306         Callee = DAG.getNode(
2307             ARMISD::WrapperPIC, dl, PtrVt,
2308             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2309         Callee = DAG.getLoad(
2310             PtrVt, dl, DAG.getEntryNode(), Callee,
2311             MachinePointerInfo::getGOT(DAG.getMachineFunction()),
2312             /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
2313                                      MachineMemOperand::MOInvariant);
2314       } else if (Subtarget->isTargetCOFF()) {
2315         assert(Subtarget->isTargetWindows() &&
2316                "Windows is the only supported COFF target");
2317         unsigned TargetFlags = GV->hasDLLImportStorageClass()
2318                                    ? ARMII::MO_DLLIMPORT
2319                                    : ARMII::MO_NO_FLAG;
2320         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
2321                                             TargetFlags);
             // dllimport callees are called through a loaded pointer.
2322         if (GV->hasDLLImportStorageClass())
2323           Callee =
2324               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2325                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2326                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2327       } else {
2328         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
2329       }
2330     }
2331   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2332     isDirect = true;
2333     // tBX takes a register source operand.
2334     const char *Sym = S->getSymbol();
2335     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2336       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2337       ARMConstantPoolValue *CPV =
2338         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2339                                       ARMPCLabelIndex, 4);
2340       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
2341       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2342       Callee = DAG.getLoad(
2343           PtrVt, dl, DAG.getEntryNode(), CPAddr,
2344           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2345       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2346       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2347     } else {
2348       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2349     }
2350   }
2351
2352   // FIXME: handle tail calls differently.
       // Pick the call node opcode from the instruction-set mode, whether the
       // call is direct, and v5T availability.
2353   unsigned CallOpc;
2354   if (Subtarget->isThumb()) {
2355     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2356       CallOpc = ARMISD::CALL_NOLINK;
2357     else
2358       CallOpc = ARMISD::CALL;
2359   } else {
2360     if (!isDirect && !Subtarget->hasV5TOps())
2361       CallOpc = ARMISD::CALL_NOLINK;
2362     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2363              // Emit regular call when code size is the priority
2364              !Subtarget->hasMinSize())
2365       // "mov lr, pc; b _foo" to avoid confusing the RSP
2366       CallOpc = ARMISD::CALL_NOLINK;
2367     else
2368       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2369   }
2370
       // Operand layout: chain, callee, argument registers, [register mask],
       // [glue].
2371   std::vector<SDValue> Ops;
2372   Ops.push_back(Chain);
2373   Ops.push_back(Callee);
2374
2375   // Add argument registers to the end of the list so that they are known live
2376   // into the call.
2377   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2378     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2379                                   RegsToPass[i].second.getValueType()));
2380
2381   // Add a register mask operand representing the call-preserved registers.
2382   if (!isTailCall) {
2383     const uint32_t *Mask;
2384     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2385     if (isThisReturn) {
2386       // For 'this' returns, use the R0-preserving mask if applicable
2387       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2388       if (!Mask) {
2389         // Set isThisReturn to false if the calling convention is not one that
2390         // allows 'returned' to be modeled in this way, so LowerCallResult does
2391         // not try to pass 'this' straight through
2392         isThisReturn = false;
2393         Mask = ARI->getCallPreservedMask(MF, CallConv);
2394       }
2395     } else
2396       Mask = ARI->getCallPreservedMask(MF, CallConv);
2397
2398     assert(Mask && "Missing call preserved mask for calling convention");
2399     Ops.push_back(DAG.getRegisterMask(Mask));
2400   }
2401
       // Glue from the last CopyToReg, present only if register arguments were
       // copied above.
2402   if (InFlag.getNode())
2403     Ops.push_back(InFlag);
2404
2405   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
       // Tail calls end here: no CALLSEQ_END and no result copies are emitted.
2406   if (isTailCall) {
2407     MF.getFrameInfo().setHasTailCall();
2408     SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2409     DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2410     return Ret;
2411   }
2412
2413   // Returns a chain and a flag for retval copy to use.
2414   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2415   InFlag = Chain.getValue(1);
2416   DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2417
2418   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
2419                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
       // CALLSEQ_END's glue is only picked up when there are results to copy
       // out.
2420   if (!Ins.empty())
2421     InFlag = Chain.getValue(1);
2422
2423   // Handle result values, copying them out of physregs into vregs that we
2424   // return.
2425   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2426                          InVals, isThisReturn,
2427                          isThisReturn ? OutVals[0] : SDValue());
2428 }
2429
2430 /// HandleByVal - Every parameter *after* a byval parameter is passed
2431 /// on the stack.  Remember the next parameter register to allocate,
2432 /// and then confiscate the rest of the parameter registers to insure
2433 /// this.
2434 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2435                                     unsigned Align) const {
2436   // Byval (as with any stack) slots are always at least 4 byte aligned.
2437   Align = std::max(Align, 4U);
2438
2439   unsigned Reg = State->AllocateReg(GPRArgRegs);
2440   if (!Reg)
2441     return;
2442
2443   unsigned AlignInRegs = Align / 4;
2444   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2445   for (unsigned i = 0; i < Waste; ++i)
2446     Reg = State->AllocateReg(GPRArgRegs);
2447
2448   if (!Reg)
2449     return;
2450
2451   unsigned Excess = 4 * (ARM::R4 - Reg);
2452
2453   // Special case when NSAA != SP and parameter size greater than size of
2454   // all remained GPR regs. In that case we can't split parameter, we must
2455   // send it to stack. We also must set NCRN to R4, so waste all
2456   // remained registers.
2457   const unsigned NSAAOffset = State->getNextStackOffset();
2458   if (NSAAOffset != 0 && Size > Excess) {
2459     while (State->AllocateReg(GPRArgRegs))
2460       ;
2461     return;
2462   }
2463
2464   // First register for byval parameter is the first register that wasn't
2465   // allocated before this method call, so it would be "reg".
2466   // If parameter is small enough to be saved in range [reg, r4), then
2467   // the end (first after last) register would be reg + param-size-in-regs,
2468   // else parameter would be splitted between registers and stack,
2469   // end register would be r4 in this case.
2470   unsigned ByValRegBegin = Reg;
2471   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2472   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2473   // Note, first register is allocated in the beginning of function already,
2474   // allocate remained amount of registers we need.
2475   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2476     State->AllocateReg(GPRArgRegs);
2477   // A byval parameter that is split between registers and memory needs its
2478   // size truncated here.
2479   // In the case where the entire structure fits in registers, we set the
2480   // size in memory to zero.
2481   Size = std::max<int>(Size - Excess, 0);
2482 }
2483
2484 /// MatchingStackOffset - Return true if the given stack call argument is
2485 /// already available in the same position (relatively) of the caller's
2486 /// incoming argument stack.
2487 static
2488 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2489                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2490                          const TargetInstrInfo *TII) {
2491   unsigned Bytes = Arg.getValueSizeInBits() / 8;
2492   int FI = std::numeric_limits<int>::max();
2493   if (Arg.getOpcode() == ISD::CopyFromReg) {
2494     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2495     if (!Register::isVirtualRegister(VR))
2496       return false;
2497     MachineInstr *Def = MRI->getVRegDef(VR);
2498     if (!Def)
2499       return false;
2500     if (!Flags.isByVal()) {
2501       if (!TII->isLoadFromStackSlot(*Def, FI))
2502         return false;
2503     } else {
2504       return false;
2505     }
2506   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2507     if (Flags.isByVal())
2508       // ByVal argument is passed in as a pointer but it's now being
2509       // dereferenced. e.g.
2510       // define @foo(%struct.X* %A) {
2511       //   tail call @bar(%struct.X* byval %A)
2512       // }
2513       return false;
2514     SDValue Ptr = Ld->getBasePtr();
2515     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2516     if (!FINode)
2517       return false;
2518     FI = FINode->getIndex();
2519   } else
2520     return false;
2521
2522   assert(FI != std::numeric_limits<int>::max());
2523   if (!MFI.isFixedObjectIndex(FI))
2524     return false;
2525   return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2526 }
2527
2528 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2529 /// for tail call optimization. Targets which want to do tail call
2530 /// optimization should implement this function.
     ///
     /// \param Callee            call target node.
     /// \param CalleeCC          calling convention of the callee.
     /// \param isVarArg          whether the call is variadic.
     /// \param isCalleeStructRet whether the call uses struct-return semantics.
     /// \param isCallerStructRet whether the caller uses struct-return semantics.
     /// \param Outs, OutVals     outgoing argument descriptions and values.
     /// \param Ins               descriptions of the call's results.
     /// \param isIndirect        whether the caller prefers to emit the call
     ///                          indirectly even though Callee is a global.
     /// \returns true only if none of the checks below rejects the call.
2531 bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2532     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2533     bool isCalleeStructRet, bool isCallerStructRet,
2534     const SmallVectorImpl<ISD::OutputArg> &Outs,
2535     const SmallVectorImpl<SDValue> &OutVals,
2536     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
2537     const bool isIndirect) const {
2538   MachineFunction &MF = DAG.getMachineFunction();
2539   const Function &CallerF = MF.getFunction();
2540   CallingConv::ID CallerCC = CallerF.getCallingConv();
2541
2542   assert(Subtarget->supportsTailCall());
2543
2544   // Indirect tail calls cannot be optimized for Thumb1 if the args
2545   // to the call take up r0-r3. The reason is that there are no legal registers
2546   // left to hold the pointer to the function to be called.
2547   if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
2548       (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect))
2549     return false;
2550
2551   // Look for obvious safe cases to perform tail call optimization that do not
2552   // require ABI changes. This is what gcc calls sibcall.
2553
2554   // Exception-handling functions need a special set of instructions to indicate
2555   // a return to the hardware. Tail-calling another function would probably
2556   // break this.
2557   if (CallerF.hasFnAttribute("interrupt"))
2558     return false;
2559
2560   // Also avoid sibcall optimization if either caller or callee uses struct
2561   // return semantics.
2562   if (isCalleeStructRet || isCallerStructRet)
2563     return false;
2564
2565   // Externally-defined functions with weak linkage should not be
2566   // tail-called on ARM when the OS does not support dynamic
2567   // pre-emption of symbols, as the AAELF spec requires normal calls
2568   // to undefined weak functions to be replaced with a NOP or jump to the
2569   // next instruction. The behaviour of branch instructions in this
2570   // situation (as used for tail calls) is implementation-defined, so we
2571   // cannot rely on the linker replacing the tail call with a return.
2572   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2573     const GlobalValue *GV = G->getGlobal();
2574     const Triple &TT = getTargetMachine().getTargetTriple();
         // As written, extern_weak callees are refused unless the target is
         // Windows with a binary format that is neither ELF nor MachO.
2575     if (GV->hasExternalWeakLinkage() &&
2576         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
2577       return false;
2578   }
2579
2580   // Check that the call results are passed in the same way.
2581   LLVMContext &C = *DAG.getContext();
2582   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
2583                                   CCAssignFnForReturn(CalleeCC, isVarArg),
2584                                   CCAssignFnForReturn(CallerCC, isVarArg)))
2585     return false;
2586   // The callee has to preserve all registers the caller needs to preserve.
2587   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2588   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
       // The masks only need comparing when the two conventions differ.
2589   if (CalleeCC != CallerCC) {
2590     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2591     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2592       return false;
2593   }
2594
2595   // If Caller's vararg or byval argument has been split between registers and
2596   // stack, do not perform tail call, since part of the argument is in caller's
2597   // local frame.
2598   const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2599   if (AFI_Caller->getArgRegsSaveSize())
2600     return false;
2601
2602   // If the callee takes no arguments then go on to check the results of the
2603   // call.
2604   if (!Outs.empty()) {
2605     // Check if stack adjustment is needed. For now, do not do this if any
2606     // argument is passed on the stack.
2607     SmallVector<CCValAssign, 16> ArgLocs;
2608     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
2609     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
2610     if (CCInfo.getNextStackOffset()) {
2611       // Check if the arguments are already laid out in the right way as
2612       // the caller's fixed stack objects.
2613       MachineFrameInfo &MFI = MF.getFrameInfo();
2614       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2615       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
           // i walks ArgLocs and realArgIdx walks Outs/OutVals; i is advanced
           // further inside the body for custom (split) locations.
2616       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2617            i != e;
2618            ++i, ++realArgIdx) {
2619         CCValAssign &VA = ArgLocs[i];
2620         EVT RegVT = VA.getLocVT();
2621         SDValue Arg = OutVals[realArgIdx];
2622         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
             // Indirectly-passed arguments are never accepted.
2623         if (VA.getLocInfo() == CCValAssign::Indirect)
2624           return false;
2625         if (VA.needsCustom()) {
2626           // f64 and vector types are split into multiple registers or
2627           // register/stack-slot combinations.  The types will not match
2628           // the registers; give up on memory f64 refs until we figure
2629           // out what to do about this.
2630           if (!VA.isRegLoc())
2631             return false;
2632           if (!ArgLocs[++i].isRegLoc())
2633             return false;
               // v2f64 takes two more locations, both of which must be
               // registers as well.
2634           if (RegVT == MVT::v2f64) {
2635             if (!ArgLocs[++i].isRegLoc())
2636               return false;
2637             if (!ArgLocs[++i].isRegLoc())
2638               return false;
2639           }
2640         } else if (!VA.isRegLoc()) {
2641           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2642                                    MFI, MRI, TII))
2643             return false;
2644         }
2645       }
2646     }
2647
         // Checked for every call that has arguments, whether or not any of
         // them is passed on the stack.
2648     const MachineRegisterInfo &MRI = MF.getRegInfo();
2649     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
2650       return false;
2651   }
2652
2653   return true;
2654 }
2655
2656 bool
2657 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2658                                   MachineFunction &MF, bool isVarArg,
2659                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
2660                                   LLVMContext &Context) const {
2661   SmallVector<CCValAssign, 16> RVLocs;
2662   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2663   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2664 }
2665
2666 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2667                                     const SDLoc &DL, SelectionDAG &DAG) {
2668   const MachineFunction &MF = DAG.getMachineFunction();
2669   const Function &F = MF.getFunction();
2670
2671   StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2672
2673   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2674   // version of the "preferred return address". These offsets affect the return
2675   // instruction if this is a return from PL1 without hypervisor extensions.
2676   //    IRQ/FIQ: +4     "subs pc, lr, #4"
2677   //    SWI:     0      "subs pc, lr, #0"
2678   //    ABORT:   +4     "subs pc, lr, #4"
2679   //    UNDEF:   +4/+2  "subs pc, lr, #0"
2680   // UNDEF varies depending on where the exception came from ARM or Thumb
2681   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2682
2683   int64_t LROffset;
2684   if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2685       IntKind == "ABORT")
2686     LROffset = 4;
2687   else if (IntKind == "SWI" || IntKind == "UNDEF")
2688     LROffset = 0;
2689   else
2690     report_fatal_error("Unsupported interrupt attribute. If present, value "
2691                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2692
2693   RetOps.insert(RetOps.begin() + 1,
2694                 DAG.getConstant(LROffset, DL, MVT::i32, false));
2695
2696   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
2697 }
2698
/// Lower an outgoing return.
///
/// Each value in \p OutVals is copied into the register location assigned by
/// the return calling convention; the copies are linked together through glue
/// and the final return node is built from the collected operands. The result
/// is an ARMISD::RET_FLAG node, or the node produced by LowerInterruptReturn
/// when the function has the "interrupt" attribute and the subtarget is not
/// M-class.
///
/// \param Chain     Incoming chain; rethreaded through every emitted copy.
/// \param CallConv  Calling convention used to pick the return assignment
///                  function.
/// \param isVarArg  Whether the function is variadic (forwarded to CCState and
///                  CCAssignFnForReturn).
/// \param Outs      Descriptions of the returned values, analysed by CCState.
/// \param OutVals   The values being returned.
/// \param dl        Debug location attached to the created nodes.
/// \param DAG       The selection DAG being built.
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 16> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  bool isLittleEndian = Subtarget->isLittle();

  // Record the number of assigned return locations in the function info.
  MachineFunction &MF = DAG.getMachineFunction();
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  AFI->setReturnRegsCount(RVLocs.size());

  // Copy the result values into the output registers.
  // `i` indexes RVLocs while `realRVLocIdx` indexes OutVals: a custom-lowered
  // value consumes several RVLocs entries (`i` is bumped inside the body) but
  // only a single OutVals entry.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    // NOTE: VA is a reference. The `VA = RVLocs[++i]` statements further down
    // copy-assign the next entry over the element VA is bound to; they do not
    // rebind the reference.
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[realRVLocIdx];
    bool ReturnF16 = false;

    if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
      // Half-precision return values can be returned like this:
      //
      // t11 f16 = fadd ...
      // t12: i16 = bitcast t11
      //   t13: i32 = zero_extend t12
      // t14: f32 = bitcast t13  <~~~~~~~ Arg
      //
      // to avoid code generation for bitcasts, we simply set Arg to the node
      // that produces the f16 value, t11 in this case.
      //
      if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
        SDValue ZE = Arg.getOperand(0);
        if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
          SDValue BC = ZE.getOperand(0);
          if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
            Arg = BC.getOperand(0);
            ReturnF16 = true;
          }
        }
      }
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::BCvt:
      // When the f16 producer was substituted for Arg above, no bitcast to
      // the location type is emitted.
      if (!ReturnF16)
        Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.needsCustom()) {
      if (VA.getLocVT() == MVT::v2f64) {
        // Extract the first half and return it in two registers.
        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                       DAG.getVTList(MVT::i32, MVT::i32), Half);

        // Which VMOVRRD result goes to which register depends on endianness.
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc
        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
                                 Flag);
        Flag = Chain.getValue(1);
        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
        VA = RVLocs[++i]; // skip ahead to next loc

        // Extract the 2nd half and fall through to handle it as an f64 value.
        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                          DAG.getConstant(1, dl, MVT::i32));
      }
      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
      // available.
      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 0 : 1),
                               Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      VA = RVLocs[++i]; // skip ahead to next loc
      // The glue update and RetOps entry for this last copy are done by the
      // code shared with the non-custom path below.
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                               fmrrd.getValue(isLittleEndian ? 1 : 0),
                               Flag);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);

    // Guarantee that all emitted copies are
    // stuck together, avoiding something bad.
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(),
                                     ReturnF16 ? MVT::f16 : VA.getLocVT()));
  }

  // Add the registers reported by getCalleeSavedRegsViaCopy as extra operands
  // of the return node. The list may be absent (null); otherwise it is
  // iterated until a zero entry is reached.
  const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (ARM::GPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i32));
      else if (ARM::DPRRegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // CPUs which aren't M-class use a special sequence to return from
  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
  // though we use "subs pc, lr, #N").
  //
  // M-class CPUs actually use a normal return sequence with a special
  // (hardware-provided) value in LR, so the normal code path works.
  if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
      !Subtarget->isMClass()) {
    if (Subtarget->isThumb1Only())
      report_fatal_error("interrupt attribute is not supported in Thumb1");
    return LowerInterruptReturn(RetOps, dl, DAG);
  }

  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
}
2847
2848 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2849   if (N->getNumValues() != 1)
2850     return false;
2851   if (!N->hasNUsesOfValue(1, 0))
2852     return false;
2853
2854   SDValue TCChain = Chain;
2855   SDNode *Copy = *N->use_begin();
2856   if (Copy->getOpcode() == ISD::CopyToReg) {
2857     // If the copy has a glue operand, we conservatively assume it isn't safe to
2858     // perform a tail call.
2859     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2860       return false;
2861     TCChain = Copy->getOperand(0);
2862   } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
2863     SDNode *VMov = Copy;
2864     // f64 returned in a pair of GPRs.
2865     SmallPtrSet<SDNode*, 2> Copies;
2866     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2867          UI != UE; ++UI) {
2868       if (UI->getOpcode() != ISD::CopyToReg)
2869         return false;
2870       Copies.insert(*UI);
2871     }
2872     if (Copies.size() > 2)
2873       return false;
2874
2875     for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
2876          UI != UE; ++UI) {
2877       SDValue UseChain = UI->getOperand(0);
2878       if (Copies.count(UseChain.getNode()))
2879         // Second CopyToReg
2880         Copy = *UI;
2881       else {
2882         // We are at the top of this chain.
2883         // If the copy has a glue operand, we conservatively assume it
2884         // isn't safe to perform a tail call.
2885         if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue)
2886           return false;
2887         // First CopyToReg
2888         TCChain = UseChain;
2889       }
2890     }
2891   } else if (Copy->getOpcode() == ISD::BITCAST) {
2892     // f32 returned in a single GPR.
2893     if (!Copy->hasOneUse())
2894       return false;
2895     Copy = *Copy->use_begin();
2896     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
2897       return false;
2898     // If the copy has a glue operand, we conservatively assume it isn't safe to
2899     // perform a tail call.
2900     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2901       return false;
2902     TCChain = Copy->getOperand(0);
2903   } else {
2904     return false;
2905   }
2906
2907   bool HasRet = false;
2908   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2909        UI != UE; ++UI) {
2910     if (UI->getOpcode() != ARMISD::RET_FLAG &&
2911         UI->getOpcode() != ARMISD::INTRET_FLAG)
2912       return false;
2913     HasRet = true;
2914   }
2915
2916   if (!HasRet)
2917     return false;
2918
2919   Chain = TCChain;
2920   return true;
2921 }
2922
2923 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2924   if (!Subtarget->supportsTailCall())
2925     return false;
2926
2927   auto Attr =
2928       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2929   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2930     return false;
2931
2932   return true;
2933 }
2934
2935 // Trying to write a 64 bit value so need to split into two 32 bit values first,
2936 // and pass the lower and high parts through.
2937 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
2938   SDLoc DL(Op);
2939   SDValue WriteValue = Op->getOperand(2);
2940
2941   // This function is only supposed to be called for i64 type argument.
2942   assert(WriteValue.getValueType() == MVT::i64
2943           && "LowerWRITE_REGISTER called for non-i64 type argument.");
2944
2945   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2946                            DAG.getConstant(0, DL, MVT::i32));
2947   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue,
2948                            DAG.getConstant(1, DL, MVT::i32));
2949   SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
2950   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
2951 }
2952
2953 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
2954 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
2955 // one of the above mentioned nodes. It has to be wrapped because otherwise
2956 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
2957 // be used to form addressing mode. These wrapped nodes will be selected
2958 // into MOVi.
2959 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
2960                                              SelectionDAG &DAG) const {
2961   EVT PtrVT = Op.getValueType();
2962   // FIXME there is no actual debug info here
2963   SDLoc dl(Op);
2964   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
2965   SDValue Res;
2966
2967   // When generating execute-only code Constant Pools must be promoted to the
2968   // global data section. It's a bit ugly that we can't share them across basic
2969   // blocks, but this way we guarantee that execute-only behaves correct with
2970   // position-independent addressing modes.
2971   if (Subtarget->genExecuteOnly()) {
2972     auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2973     auto T = const_cast<Type*>(CP->getType());
2974     auto C = const_cast<Constant*>(CP->getConstVal());
2975     auto M = const_cast<Module*>(DAG.getMachineFunction().
2976                                  getFunction().getParent());
2977     auto GV = new GlobalVariable(
2978                     *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
2979                     Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
2980                     Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
2981                     Twine(AFI->createPICLabelUId())
2982                   );
2983     SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
2984                                             dl, PtrVT);
2985     return LowerGlobalAddress(GA, DAG);
2986   }
2987
2988   if (CP->isMachineConstantPoolEntry())
2989     Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
2990                                     CP->getAlignment());
2991   else
2992     Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
2993                                     CP->getAlignment());
2994   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
2995 }
2996
2997 unsigned ARMTargetLowering::getJumpTableEncoding() const {
2998   return MachineJumpTableInfo::EK_Inline;
2999 }
3000
3001 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3002                                              SelectionDAG &DAG) const {
3003   MachineFunction &MF = DAG.getMachineFunction();
3004   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3005   unsigned ARMPCLabelIndex = 0;
3006   SDLoc DL(Op);
3007   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3008   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3009   SDValue CPAddr;
3010   bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3011   if (!IsPositionIndependent) {
3012     CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
3013   } else {
3014     unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3015     ARMPCLabelIndex = AFI->createPICLabelUId();
3016     ARMConstantPoolValue *CPV =
3017       ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3018                                       ARMCP::CPBlockAddress, PCAdj);
3019     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3020   }
3021   CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3022   SDValue Result = DAG.getLoad(
3023       PtrVT, DL, DAG.getEntryNode(), CPAddr,
3024       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3025   if (!IsPositionIndependent)
3026     return Result;
3027   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3028   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3029 }
3030
3031 /// Convert a TLS address reference into the correct sequence of loads
3032 /// and calls to compute the variable's address for Darwin, and return an
3033 /// SDValue containing the final node.
3034
3035 /// Darwin only has one TLS scheme which must be capable of dealing with the
3036 /// fully general situation, in the worst case. This means:
3037 ///     + "extern __thread" declaration.
3038 ///     + Defined in a possibly unknown dynamic library.
3039 ///
3040 /// The general system is that each __thread variable has a [3 x i32] descriptor
3041 /// which contains information used by the runtime to calculate the address. The
3042 /// only part of this the compiler needs to know about is the first word, which
3043 /// contains a function pointer that must be called with the address of the
3044 /// entire descriptor in "r0".
3045 ///
3046 /// Since this descriptor may be in a different unit, in general access must
3047 /// proceed along the usual ARM rules. A common sequence to produce is:
3048 ///
3049 ///     movw rT1, :lower16:_var$non_lazy_ptr
3050 ///     movt rT1, :upper16:_var$non_lazy_ptr
3051 ///     ldr r0, [rT1]
3052 ///     ldr rT2, [r0]
3053 ///     blx rT2
3054 ///     [...address now in r0...]
3055 SDValue
3056 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3057                                                SelectionDAG &DAG) const {
3058   assert(Subtarget->isTargetDarwin() &&
3059          "This function expects a Darwin target");
3060   SDLoc DL(Op);
3061
3062   // First step is to get the address of the actua global symbol. This is where
3063   // the TLS descriptor lives.
3064   SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3065
3066   // The first entry in the descriptor is a function pointer that we must call
3067   // to obtain the address of the variable.
3068   SDValue Chain = DAG.getEntryNode();
3069   SDValue FuncTLVGet = DAG.getLoad(
3070       MVT::i32, DL, Chain, DescAddr,
3071       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
3072       /* Alignment = */ 4,
3073       MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3074           MachineMemOperand::MOInvariant);
3075   Chain = FuncTLVGet.getValue(1);
3076
3077   MachineFunction &F = DAG.getMachineFunction();
3078   MachineFrameInfo &MFI = F.getFrameInfo();
3079   MFI.setAdjustsStack(true);
3080
3081   // TLS calls preserve all registers except those that absolutely must be
3082   // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3083   // silly).
3084   auto TRI =
3085       getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3086   auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3087   const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3088
3089   // Finally, we can make the call. This is just a degenerate version of a
3090   // normal AArch64 call node: r0 takes the address of the descriptor, and
3091   // returns the address of the variable in this thread.
3092   Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3093   Chain =
3094       DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3095                   Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3096                   DAG.getRegisterMask(Mask), Chain.getValue(1));
3097   return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3098 }
3099
3100 SDValue
3101 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3102                                                 SelectionDAG &DAG) const {
3103   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3104
3105   SDValue Chain = DAG.getEntryNode();
3106   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3107   SDLoc DL(Op);
3108
3109   // Load the current TEB (thread environment block)
3110   SDValue Ops[] = {Chain,
3111                    DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3112                    DAG.getTargetConstant(15, DL, MVT::i32),
3113                    DAG.getTargetConstant(0, DL, MVT::i32),
3114                    DAG.getTargetConstant(13, DL, MVT::i32),
3115                    DAG.getTargetConstant(0, DL, MVT::i32),
3116                    DAG.getTargetConstant(2, DL, MVT::i32)};
3117   SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3118                                    DAG.getVTList(MVT::i32, MVT::Other), Ops);
3119
3120   SDValue TEB = CurrentTEB.getValue(0);
3121   Chain = CurrentTEB.getValue(1);
3122
3123   // Load the ThreadLocalStoragePointer from the TEB
3124   // A pointer to the TLS array is located at offset 0x2c from the TEB.
3125   SDValue TLSArray =
3126       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3127   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3128
3129   // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3130   // offset into the TLSArray.
3131
3132   // Load the TLS index from the C runtime
3133   SDValue TLSIndex =
3134       DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3135   TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3136   TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3137
3138   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3139                               DAG.getConstant(2, DL, MVT::i32));
3140   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3141                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3142                             MachinePointerInfo());
3143
3144   // Get the offset of the start of the .tls section (section base)
3145   const auto *GA = cast<GlobalAddressSDNode>(Op);
3146   auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3147   SDValue Offset = DAG.getLoad(
3148       PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3149                                     DAG.getTargetConstantPool(CPV, PtrVT, 4)),
3150       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3151
3152   return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3153 }
3154
3155 // Lower ISD::GlobalTLSAddress using the "general dynamic" model
3156 SDValue
3157 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3158                                                  SelectionDAG &DAG) const {
3159   SDLoc dl(GA);
3160   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3161   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3162   MachineFunction &MF = DAG.getMachineFunction();
3163   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3164   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3165   ARMConstantPoolValue *CPV =
3166     ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3167                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3168   SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3169   Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3170   Argument = DAG.getLoad(
3171       PtrVT, dl, DAG.getEntryNode(), Argument,
3172       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3173   SDValue Chain = Argument.getValue(1);
3174
3175   SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3176   Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3177
3178   // call __tls_get_addr.
3179   ArgListTy Args;
3180   ArgListEntry Entry;
3181   Entry.Node = Argument;
3182   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
3183   Args.push_back(Entry);
3184
3185   // FIXME: is there useful debug info available here?
3186   TargetLowering::CallLoweringInfo CLI(DAG);
3187   CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3188       CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3189       DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3190
3191   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3192   return CallResult.first;
3193 }
3194
3195 // Lower ISD::GlobalTLSAddress using the "initial exec" or
3196 // "local exec" model.
3197 SDValue
3198 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3199                                         SelectionDAG &DAG,
3200                                         TLSModel::Model model) const {
3201   const GlobalValue *GV = GA->getGlobal();
3202   SDLoc dl(GA);
3203   SDValue Offset;
3204   SDValue Chain = DAG.getEntryNode();
3205   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3206   // Get the Thread Pointer
3207   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3208
3209   if (model == TLSModel::InitialExec) {
3210     MachineFunction &MF = DAG.getMachineFunction();
3211     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3212     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3213     // Initial exec model.
3214     unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3215     ARMConstantPoolValue *CPV =
3216       ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3217                                       ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3218                                       true);
3219     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3220     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3221     Offset = DAG.getLoad(
3222         PtrVT, dl, Chain, Offset,
3223         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3224     Chain = Offset.getValue(1);
3225
3226     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3227     Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3228
3229     Offset = DAG.getLoad(
3230         PtrVT, dl, Chain, Offset,
3231         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3232   } else {
3233     // local exec model
3234     assert(model == TLSModel::LocalExec);
3235     ARMConstantPoolValue *CPV =
3236       ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3237     Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3238     Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3239     Offset = DAG.getLoad(
3240         PtrVT, dl, Chain, Offset,
3241         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3242   }
3243
3244   // The address of the thread local variable is the add of the thread
3245   // pointer with the offset of the variable.
3246   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3247 }
3248
3249 SDValue
3250 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3251   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3252   if (DAG.getTarget().useEmulatedTLS())
3253     return LowerToTLSEmulatedModel(GA, DAG);
3254
3255   if (Subtarget->isTargetDarwin())
3256     return LowerGlobalTLSAddressDarwin(Op, DAG);
3257
3258   if (Subtarget->isTargetWindows())
3259     return LowerGlobalTLSAddressWindows(Op, DAG);
3260
3261   // TODO: implement the "local dynamic" model
3262   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3263   TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3264
3265   switch (model) {
3266     case TLSModel::GeneralDynamic:
3267     case TLSModel::LocalDynamic:
3268       return LowerToTLSGeneralDynamicModel(GA, DAG);
3269     case TLSModel::InitialExec:
3270     case TLSModel::LocalExec:
3271       return LowerToTLSExecModels(GA, DAG, model);
3272   }
3273   llvm_unreachable("bogus TLS model");
3274 }
3275
3276 /// Return true if all users of V are within function F, looking through
3277 /// ConstantExprs.
3278 static bool allUsersAreInFunction(const Value *V, const Function *F) {
3279   SmallVector<const User*,4> Worklist;
3280   for (auto *U : V->users())
3281     Worklist.push_back(U);
3282   while (!Worklist.empty()) {
3283     auto *U = Worklist.pop_back_val();
3284     if (isa<ConstantExpr>(U)) {
3285       for (auto *UU : U->users())
3286         Worklist.push_back(UU);
3287       continue;
3288     }
3289
3290     auto *I = dyn_cast<Instruction>(U);
3291     if (!I || I->getParent()->getParent() != F)
3292       return false;
3293   }
3294   return true;
3295 }
3296
3297 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3298                                      const GlobalValue *GV, SelectionDAG &DAG,
3299                                      EVT PtrVT, const SDLoc &dl) {
3300   // If we're creating a pool entry for a constant global with unnamed address,
3301   // and the global is small enough, we can emit it inline into the constant pool
3302   // to save ourselves an indirection.
3303   //
3304   // This is a win if the constant is only used in one function (so it doesn't
3305   // need to be duplicated) or duplicating the constant wouldn't increase code
3306   // size (implying the constant is no larger than 4 bytes).
3307   const Function &F = DAG.getMachineFunction().getFunction();
3308
3309   // We rely on this decision to inline being idemopotent and unrelated to the
3310   // use-site. We know that if we inline a variable at one use site, we'll
3311   // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3312   // doesn't know about this optimization, so bail out if it's enabled else
3313   // we could decide to inline here (and thus never emit the GV) but require
3314   // the GV from fast-isel generated code.
3315   if (!EnableConstpoolPromotion ||
3316       DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3317       return SDValue();
3318
3319   auto *GVar = dyn_cast<GlobalVariable>(GV);
3320   if (!GVar || !GVar->hasInitializer() ||
3321       !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3322       !GVar->hasLocalLinkage())
3323     return SDValue();
3324
3325   // If we inline a value that contains relocations, we move the relocations
3326   // from .data to .text. This is not allowed in position-independent code.
3327   auto *Init = GVar->getInitializer();
3328   if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3329       Init->needsRelocation())
3330     return SDValue();
3331
3332   // The constant islands pass can only really deal with alignment requests
3333   // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3334   // any type wanting greater alignment requirements than 4 bytes. We also
3335   // can only promote constants that are multiples of 4 bytes in size or
3336   // are paddable to a multiple of 4. Currently we only try and pad constants
3337   // that are strings for simplicity.
3338   auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3339   unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3340   unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
3341   unsigned RequiredPadding = 4 - (Size % 4);
3342   bool PaddingPossible =
3343     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3344   if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
3345       Size == 0)
3346     return SDValue();
3347
3348   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3349   MachineFunction &MF = DAG.getMachineFunction();
3350   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3351
3352   // We can't bloat the constant pool too much, else the ConstantIslands pass
3353   // may fail to converge. If we haven't promoted this global yet (it may have
3354   // multiple uses), and promoting it would increase the constant pool size (Sz
3355   // > 4), ensure we have space to do so up to MaxTotal.
3356   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3357     if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3358         ConstpoolPromotionMaxTotal)
3359       return SDValue();
3360
3361   // This is only valid if all users are in a single function; we can't clone
3362   // the constant in general. The LLVM IR unnamed_addr allows merging
3363   // constants, but not cloning them.
3364   //
3365   // We could potentially allow cloning if we could prove all uses of the
3366   // constant in the current function don't care about the address, like
3367   // printf format strings. But that isn't implemented for now.
3368   if (!allUsersAreInFunction(GVar, &F))
3369     return SDValue();
3370
3371   // We're going to inline this global. Pad it out if needed.
3372   if (RequiredPadding != 4) {
3373     StringRef S = CDAInit->getAsString();
3374
3375     SmallVector<uint8_t,16> V(S.size());
3376     std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3377     while (RequiredPadding--)
3378       V.push_back(0);
3379     Init = ConstantDataArray::get(*DAG.getContext(), V);
3380   }
3381
3382   auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3383   SDValue CPAddr =
3384     DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
3385   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3386     AFI->markGlobalAsPromotedToConstantPool(GVar);
3387     AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3388                                       PaddedSize - 4);
3389   }
3390   ++NumConstpoolPromoted;
3391   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3392 }
3393
3394 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
3395   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3396     if (!(GV = GA->getBaseObject()))
3397       return false;
3398   if (const auto *V = dyn_cast<GlobalVariable>(GV))
3399     return V->isConstant();
3400   return isa<Function>(GV);
3401 }
3402
3403 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3404                                               SelectionDAG &DAG) const {
3405   switch (Subtarget->getTargetTriple().getObjectFormat()) {
3406   default: llvm_unreachable("unknown object format");
3407   case Triple::COFF:
3408     return LowerGlobalAddressWindows(Op, DAG);
3409   case Triple::ELF:
3410     return LowerGlobalAddressELF(Op, DAG);
3411   case Triple::MachO:
3412     return LowerGlobalAddressDarwin(Op, DAG);
3413   }
3414 }
3415
3416 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3417                                                  SelectionDAG &DAG) const {
3418   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3419   SDLoc dl(Op);
3420   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3421   const TargetMachine &TM = getTargetMachine();
3422   bool IsRO = isReadOnly(GV);
3423
3424   // promoteToConstantPool only if not generating XO text section
3425   if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
3426     if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3427       return V;
3428
3429   if (isPositionIndependent()) {
3430     bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
3431     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
3432                                            UseGOT_PREL ? ARMII::MO_GOT : 0);
3433     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3434     if (UseGOT_PREL)
3435       Result =
3436           DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3437                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3438     return Result;
3439   } else if (Subtarget->isROPI() && IsRO) {
3440     // PC-relative.
3441     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3442     SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3443     return Result;
3444   } else if (Subtarget->isRWPI() && !IsRO) {
3445     // SB-relative.
3446     SDValue RelAddr;
3447     if (Subtarget->useMovt()) {
3448       ++NumMovwMovt;
3449       SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3450       RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3451     } else { // use literal pool for address constant
3452       ARMConstantPoolValue *CPV =
3453         ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3454       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3455       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3456       RelAddr = DAG.getLoad(
3457           PtrVT, dl, DAG.getEntryNode(), CPAddr,
3458           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3459     }
3460     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3461     SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3462     return Result;
3463   }
3464
3465   // If we have T2 ops, we can materialize the address directly via movt/movw
3466   // pair. This is always cheaper.
3467   if (Subtarget->useMovt()) {
3468     ++NumMovwMovt;
3469     // FIXME: Once remat is capable of dealing with instructions with register
3470     // operands, expand this into two nodes.
3471     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3472                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3473   } else {
3474     SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
3475     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3476     return DAG.getLoad(
3477         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3478         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3479   }
3480 }
3481
3482 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3483                                                     SelectionDAG &DAG) const {
3484   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3485          "ROPI/RWPI not currently supported for Darwin");
3486   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3487   SDLoc dl(Op);
3488   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3489
3490   if (Subtarget->useMovt())
3491     ++NumMovwMovt;
3492
3493   // FIXME: Once remat is capable of dealing with instructions with register
3494   // operands, expand this into multiple nodes
3495   unsigned Wrapper =
3496       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3497
3498   SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3499   SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3500
3501   if (Subtarget->isGVIndirectSymbol(GV))
3502     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3503                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3504   return Result;
3505 }
3506
3507 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3508                                                      SelectionDAG &DAG) const {
3509   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3510   assert(Subtarget->useMovt() &&
3511          "Windows on ARM expects to use movw/movt");
3512   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3513          "ROPI/RWPI not currently supported for Windows");
3514
3515   const TargetMachine &TM = getTargetMachine();
3516   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3517   ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3518   if (GV->hasDLLImportStorageClass())
3519     TargetFlags = ARMII::MO_DLLIMPORT;
3520   else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
3521     TargetFlags = ARMII::MO_COFFSTUB;
3522   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3523   SDValue Result;
3524   SDLoc DL(Op);
3525
3526   ++NumMovwMovt;
3527
3528   // FIXME: Once remat is capable of dealing with instructions with register
3529   // operands, expand this into two nodes.
3530   Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3531                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3532                                                   TargetFlags));
3533   if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3534     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3535                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3536   return Result;
3537 }
3538
3539 SDValue
3540 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3541   SDLoc dl(Op);
3542   SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3543   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3544                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3545                      Op.getOperand(1), Val);
3546 }
3547
3548 SDValue
3549 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3550   SDLoc dl(Op);
3551   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3552                      Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3553 }
3554
3555 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3556                                                       SelectionDAG &DAG) const {
3557   SDLoc dl(Op);
3558   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3559                      Op.getOperand(0));
3560 }
3561
3562 SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3563     SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3564   unsigned IntNo =
3565       cast<ConstantSDNode>(
3566           Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other))
3567           ->getZExtValue();
3568   switch (IntNo) {
3569     default:
3570       return SDValue();  // Don't custom lower most intrinsics.
3571     case Intrinsic::arm_gnu_eabi_mcount: {
3572       MachineFunction &MF = DAG.getMachineFunction();
3573       EVT PtrVT = getPointerTy(DAG.getDataLayout());
3574       SDLoc dl(Op);
3575       SDValue Chain = Op.getOperand(0);
3576       // call "\01__gnu_mcount_nc"
3577       const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3578       const uint32_t *Mask =
3579           ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3580       assert(Mask && "Missing call preserved mask for calling convention");
3581       // Mark LR an implicit live-in.
3582       unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3583       SDValue ReturnAddress =
3584           DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3585       std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue};
3586       SDValue Callee =
3587           DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
3588       SDValue RegisterMask = DAG.getRegisterMask(Mask);
3589       if (Subtarget->isThumb())
3590         return SDValue(
3591             DAG.getMachineNode(
3592                 ARM::tBL_PUSHLR, dl, ResultTys,
3593                 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3594                  DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3595             0);
3596       return SDValue(
3597           DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3598                              {ReturnAddress, Callee, RegisterMask, Chain}),
3599           0);
3600     }
3601   }
3602 }
3603
3604 SDValue
3605 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3606                                           const ARMSubtarget *Subtarget) const {
3607   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3608   SDLoc dl(Op);
3609   switch (IntNo) {
3610   default: return SDValue();    // Don't custom lower most intrinsics.
3611   case Intrinsic::thread_pointer: {
3612     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3613     return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3614   }
3615   case Intrinsic::eh_sjlj_lsda: {
3616     MachineFunction &MF = DAG.getMachineFunction();
3617     ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3618     unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3619     EVT PtrVT = getPointerTy(DAG.getDataLayout());
3620     SDValue CPAddr;
3621     bool IsPositionIndependent = isPositionIndependent();
3622     unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3623     ARMConstantPoolValue *CPV =
3624       ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3625                                       ARMCP::CPLSDA, PCAdj);
3626     CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
3627     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3628     SDValue Result = DAG.getLoad(
3629         PtrVT, dl, DAG.getEntryNode(), CPAddr,
3630         MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3631
3632     if (IsPositionIndependent) {
3633       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3634       Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3635     }
3636     return Result;
3637   }
3638   case Intrinsic::arm_neon_vabs:
3639     return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3640                         Op.getOperand(1));
3641   case Intrinsic::arm_neon_vmulls:
3642   case Intrinsic::arm_neon_vmullu: {
3643     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3644       ? ARMISD::VMULLs : ARMISD::VMULLu;
3645     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3646                        Op.getOperand(1), Op.getOperand(2));
3647   }
3648   case Intrinsic::arm_neon_vminnm:
3649   case Intrinsic::arm_neon_vmaxnm: {
3650     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3651       ? ISD::FMINNUM : ISD::FMAXNUM;
3652     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3653                        Op.getOperand(1), Op.getOperand(2));
3654   }
3655   case Intrinsic::arm_neon_vminu:
3656   case Intrinsic::arm_neon_vmaxu: {
3657     if (Op.getValueType().isFloatingPoint())
3658       return SDValue();
3659     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3660       ? ISD::UMIN : ISD::UMAX;
3661     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3662                          Op.getOperand(1), Op.getOperand(2));
3663   }
3664   case Intrinsic::arm_neon_vmins:
3665   case Intrinsic::arm_neon_vmaxs: {
3666     // v{min,max}s is overloaded between signed integers and floats.
3667     if (!Op.getValueType().isFloatingPoint()) {
3668       unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3669         ? ISD::SMIN : ISD::SMAX;
3670       return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3671                          Op.getOperand(1), Op.getOperand(2));
3672     }
3673     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3674       ? ISD::FMINIMUM : ISD::FMAXIMUM;
3675     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3676                        Op.getOperand(1), Op.getOperand(2));
3677   }
3678   case Intrinsic::arm_neon_vtbl1:
3679     return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3680                        Op.getOperand(1), Op.getOperand(2));
3681   case Intrinsic::arm_neon_vtbl2:
3682     return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3683                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3684   }
3685 }
3686
3687 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
3688                                  const ARMSubtarget *Subtarget) {
3689   SDLoc dl(Op);
3690   ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
3691   auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
3692   if (SSID == SyncScope::SingleThread)
3693     return Op;
3694
3695   if (!Subtarget->hasDataBarrier()) {
3696     // Some ARMv6 cpus can support data barriers with an mcr instruction.
3697     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3698     // here.
3699     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3700            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3701     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3702                        DAG.getConstant(0, dl, MVT::i32));
3703   }
3704
3705   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
3706   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
3707   ARM_MB::MemBOpt Domain = ARM_MB::ISH;
3708   if (Subtarget->isMClass()) {
3709     // Only a full system barrier exists in the M-class architectures.
3710     Domain = ARM_MB::SY;
3711   } else if (Subtarget->preferISHSTBarriers() &&
3712              Ord == AtomicOrdering::Release) {
3713     // Swift happens to implement ISHST barriers in a way that's compatible with
3714     // Release semantics but weaker than ISH so we'd be fools not to use
3715     // it. Beware: other processors probably don't!
3716     Domain = ARM_MB::ISHST;
3717   }
3718
3719   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
3720                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
3721                      DAG.getConstant(Domain, dl, MVT::i32));
3722 }
3723
3724 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
3725                              const ARMSubtarget *Subtarget) {
3726   // ARM pre v5TE and Thumb1 does not have preload instructions.
3727   if (!(Subtarget->isThumb2() ||
3728         (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
3729     // Just preserve the chain.
3730     return Op.getOperand(0);
3731
3732   SDLoc dl(Op);
3733   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
3734   if (!isRead &&
3735       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
3736     // ARMv7 with MP extension has PLDW.
3737     return Op.getOperand(0);
3738
3739   unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3740   if (Subtarget->isThumb()) {
3741     // Invert the bits.
3742     isRead = ~isRead & 1;
3743     isData = ~isData & 1;
3744   }
3745
3746   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
3747                      Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
3748                      DAG.getConstant(isData, dl, MVT::i32));
3749 }
3750
3751 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
3752   MachineFunction &MF = DAG.getMachineFunction();
3753   ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
3754
3755   // vastart just stores the address of the VarArgsFrameIndex slot into the
3756   // memory location argument.
3757   SDLoc dl(Op);
3758   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
3759   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3760   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3761   return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
3762                       MachinePointerInfo(SV));
3763 }
3764
3765 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
3766                                                 CCValAssign &NextVA,
3767                                                 SDValue &Root,
3768                                                 SelectionDAG &DAG,
3769                                                 const SDLoc &dl) const {
3770   MachineFunction &MF = DAG.getMachineFunction();
3771   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3772
3773   const TargetRegisterClass *RC;
3774   if (AFI->isThumb1OnlyFunction())
3775     RC = &ARM::tGPRRegClass;
3776   else
3777     RC = &ARM::GPRRegClass;
3778
3779   // Transform the arguments stored in physical registers into virtual ones.
3780   unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3781   SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3782
3783   SDValue ArgValue2;
3784   if (NextVA.isMemLoc()) {
3785     MachineFrameInfo &MFI = MF.getFrameInfo();
3786     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
3787
3788     // Create load node to retrieve arguments from the stack.
3789     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3790     ArgValue2 = DAG.getLoad(
3791         MVT::i32, dl, Root, FIN,
3792         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3793   } else {
3794     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3795     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
3796   }
3797   if (!Subtarget->isLittle())
3798     std::swap (ArgValue, ArgValue2);
3799   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
3800 }
3801
3802 // The remaining GPRs hold either the beginning of variable-argument
3803 // data, or the beginning of an aggregate passed by value (usually
3804 // byval).  Either way, we allocate stack slots adjacent to the data
3805 // provided by our caller, and store the unallocated registers there.
3806 // If this is a variadic function, the va_list pointer will begin with
3807 // these values; otherwise, this reassembles a (byval) structure that
3808 // was split between registers and memory.
3809 // Return: The frame index registers were stored into.
3810 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
3811                                       const SDLoc &dl, SDValue &Chain,
3812                                       const Value *OrigArg,
3813                                       unsigned InRegsParamRecordIdx,
3814                                       int ArgOffset, unsigned ArgSize) const {
3815   // Currently, two use-cases possible:
3816   // Case #1. Non-var-args function, and we meet first byval parameter.
3817   //          Setup first unallocated register as first byval register;
3818   //          eat all remained registers
3819   //          (these two actions are performed by HandleByVal method).
3820   //          Then, here, we initialize stack frame with
3821   //          "store-reg" instructions.
3822   // Case #2. Var-args function, that doesn't contain byval parameters.
3823   //          The same: eat all remained unallocated registers,
3824   //          initialize stack frame.
3825
3826   MachineFunction &MF = DAG.getMachineFunction();
3827   MachineFrameInfo &MFI = MF.getFrameInfo();
3828   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3829   unsigned RBegin, REnd;
3830   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
3831     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
3832   } else {
3833     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3834     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
3835     REnd = ARM::R4;
3836   }
3837
3838   if (REnd != RBegin)
3839     ArgOffset = -4 * (ARM::R4 - RBegin);
3840
3841   auto PtrVT = getPointerTy(DAG.getDataLayout());
3842   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
3843   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
3844
3845   SmallVector<SDValue, 4> MemOps;
3846   const TargetRegisterClass *RC =
3847       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
3848
3849   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
3850     unsigned VReg = MF.addLiveIn(Reg, RC);
3851     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
3852     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
3853                                  MachinePointerInfo(OrigArg, 4 * i));
3854     MemOps.push_back(Store);
3855     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
3856   }
3857
3858   if (!MemOps.empty())
3859     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3860   return FrameIndex;
3861 }
3862
3863 // Setup stack frame, the va_list pointer will start from.
3864 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
3865                                              const SDLoc &dl, SDValue &Chain,
3866                                              unsigned ArgOffset,
3867                                              unsigned TotalArgRegsSaveSize,
3868                                              bool ForceMutable) const {
3869   MachineFunction &MF = DAG.getMachineFunction();
3870   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3871
3872   // Try to store any remaining integer argument regs
3873   // to their spots on the stack so that they may be loaded by dereferencing
3874   // the result of va_next.
3875   // If there is no regs to be stored, just point address after last
3876   // argument passed via stack.
3877   int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
3878                                   CCInfo.getInRegsParamsCount(),
3879                                   CCInfo.getNextStackOffset(),
3880                                   std::max(4U, TotalArgRegsSaveSize));
3881   AFI->setVarArgsFrameIndex(FrameIndex);
3882 }
3883
3884 SDValue ARMTargetLowering::LowerFormalArguments(
3885     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3886     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3887     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3888   MachineFunction &MF = DAG.getMachineFunction();
3889   MachineFrameInfo &MFI = MF.getFrameInfo();
3890
3891   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3892
3893   // Assign locations to all of the incoming arguments.
3894   SmallVector<CCValAssign, 16> ArgLocs;
3895   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3896                  *DAG.getContext());
3897   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
3898
3899   SmallVector<SDValue, 16> ArgValues;
3900   SDValue ArgValue;
3901   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
3902   unsigned CurArgIdx = 0;
3903
3904   // Initially ArgRegsSaveSize is zero.
3905   // Then we increase this value each time we meet byval parameter.
3906   // We also increase this value in case of varargs function.
3907   AFI->setArgRegsSaveSize(0);
3908
3909   // Calculate the amount of stack space that we need to allocate to store
3910   // byval and variadic arguments that are passed in registers.
3911   // We need to know this before we allocate the first byval or variadic
3912   // argument, as they will be allocated a stack slot below the CFA (Canonical
3913   // Frame Address, the stack pointer at entry to the function).
3914   unsigned ArgRegBegin = ARM::R4;
3915   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3916     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
3917       break;
3918
3919     CCValAssign &VA = ArgLocs[i];
3920     unsigned Index = VA.getValNo();
3921     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
3922     if (!Flags.isByVal())
3923       continue;
3924
3925     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
3926     unsigned RBegin, REnd;
3927     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
3928     ArgRegBegin = std::min(ArgRegBegin, RBegin);
3929
3930     CCInfo.nextInRegsParam();
3931   }
3932   CCInfo.rewindByValRegsInfo();
3933
3934   int lastInsIndex = -1;
3935   if (isVarArg && MFI.hasVAStart()) {
3936     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
3937     if (RegIdx != array_lengthof(GPRArgRegs))
3938       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
3939   }
3940
3941   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
3942   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
3943   auto PtrVT = getPointerTy(DAG.getDataLayout());
3944
3945   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3946     CCValAssign &VA = ArgLocs[i];
3947     if (Ins[VA.getValNo()].isOrigArg()) {
3948       std::advance(CurOrigArg,
3949                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
3950       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
3951     }
3952     // Arguments stored in registers.
3953     if (VA.isRegLoc()) {
3954       EVT RegVT = VA.getLocVT();
3955
3956       if (VA.needsCustom()) {
3957         // f64 and vector types are split up into multiple registers or
3958         // combinations of registers and stack slots.
3959         if (VA.getLocVT() == MVT::v2f64) {
3960           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
3961                                                    Chain, DAG, dl);
3962           VA = ArgLocs[++i]; // skip ahead to next loc
3963           SDValue ArgValue2;
3964           if (VA.isMemLoc()) {
3965             int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
3966             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3967             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
3968                                     MachinePointerInfo::getFixedStack(
3969                                         DAG.getMachineFunction(), FI));
3970           } else {
3971             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
3972                                              Chain, DAG, dl);
3973           }
3974           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
3975           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3976                                  ArgValue, ArgValue1,
3977                                  DAG.getIntPtrConstant(0, dl));
3978           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
3979                                  ArgValue, ArgValue2,
3980                                  DAG.getIntPtrConstant(1, dl));
3981         } else
3982           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
3983       } else {
3984         const TargetRegisterClass *RC;
3985
3986
3987         if (RegVT == MVT::f16)
3988           RC = &ARM::HPRRegClass;
3989         else if (RegVT == MVT::f32)
3990           RC = &ARM::SPRRegClass;
3991         else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
3992           RC = &ARM::DPRRegClass;
3993         else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
3994           RC = &ARM::QPRRegClass;
3995         else if (RegVT == MVT::i32)
3996           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
3997                                            : &ARM::GPRRegClass;
3998         else
3999           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4000
4001         // Transform the arguments in physical registers into virtual ones.
4002         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
4003         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4004
4005         // If this value is passed in r0 and has the returned attribute (e.g.
4006         // C++ 'structors), record this fact for later use.
4007         if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4008           AFI->setPreservesR0();
4009         }
4010       }
4011
4012       // If this is an 8 or 16-bit value, it is really passed promoted
4013       // to 32 bits.  Insert an assert[sz]ext to capture this, then
4014       // truncate to the right size.
4015       switch (VA.getLocInfo()) {
4016       default: llvm_unreachable("Unknown loc info!");
4017       case CCValAssign::Full: break;
4018       case CCValAssign::BCvt:
4019         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4020         break;
4021       case CCValAssign::SExt:
4022         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4023                                DAG.getValueType(VA.getValVT()));
4024         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4025         break;
4026       case CCValAssign::ZExt:
4027         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4028                                DAG.getValueType(VA.getValVT()));
4029         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4030         break;
4031       }
4032
4033       InVals.push_back(ArgValue);
4034     } else { // VA.isRegLoc()
4035       // sanity check
4036       assert(VA.isMemLoc());
4037       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4038
4039       int index = VA.getValNo();
4040
4041       // Some Ins[] entries become multiple ArgLoc[] entries.
4042       // Process them only once.
4043       if (index != lastInsIndex)
4044         {
4045           ISD::ArgFlagsTy Flags = Ins[index].Flags;
4046           // FIXME: For now, all byval parameter objects are marked mutable.
4047           // This can be changed with more analysis.
4048           // In case of tail call optimization mark all arguments mutable.
4049           // Since they could be overwritten by lowering of arguments in case of
4050           // a tail call.
4051           if (Flags.isByVal()) {
4052             assert(Ins[index].isOrigArg() &&
4053                    "Byval arguments cannot be implicit");
4054             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4055
4056             int FrameIndex = StoreByValRegs(
4057                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4058                 VA.getLocMemOffset(), Flags.getByValSize());
4059             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4060             CCInfo.nextInRegsParam();
4061           } else {
4062             unsigned FIOffset = VA.getLocMemOffset();
4063             int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4064                                            FIOffset, true);
4065
4066             // Create load nodes to retrieve arguments from the stack.
4067             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4068             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4069                                          MachinePointerInfo::getFixedStack(
4070                                              DAG.getMachineFunction(), FI)));
4071           }
4072           lastInsIndex = index;
4073         }
4074     }
4075   }
4076
4077   // varargs
4078   if (isVarArg && MFI.hasVAStart())
4079     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
4080                          CCInfo.getNextStackOffset(),
4081                          TotalArgRegsSaveSize);
4082
4083   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
4084
4085   return Chain;
4086 }
4087
4088 /// isFloatingPointZero - Return true if this is +0.0.
4089 static bool isFloatingPointZero(SDValue Op) {
4090   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4091     return CFP->getValueAPF().isPosZero();
4092   else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4093     // Maybe this has already been legalized into the constant pool?
4094     if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4095       SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4096       if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4097         if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4098           return CFP->getValueAPF().isPosZero();
4099     }
4100   } else if (Op->getOpcode() == ISD::BITCAST &&
4101              Op->getValueType(0) == MVT::f64) {
4102     // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4103     // created by LowerConstantFP().
4104     SDValue BitcastOp = Op->getOperand(0);
4105     if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4106         isNullConstant(BitcastOp->getOperand(0)))
4107       return true;
4108   }
4109   return false;
4110 }
4111
4112 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4113 /// the given operands.
4114 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4115                                      SDValue &ARMcc, SelectionDAG &DAG,
4116                                      const SDLoc &dl) const {
4117   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4118     unsigned C = RHSC->getZExtValue();
4119     if (!isLegalICmpImmediate((int32_t)C)) {
4120       // Constant does not fit, try adjusting it by one.
4121       switch (CC) {
4122       default: break;
4123       case ISD::SETLT:
4124       case ISD::SETGE:
4125         if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4126           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4127           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4128         }
4129         break;
4130       case ISD::SETULT:
4131       case ISD::SETUGE:
4132         if (C != 0 && isLegalICmpImmediate(C-1)) {
4133           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4134           RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4135         }
4136         break;
4137       case ISD::SETLE:
4138       case ISD::SETGT:
4139         if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4140           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4141           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4142         }
4143         break;
4144       case ISD::SETULE:
4145       case ISD::SETUGT:
4146         if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4147           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4148           RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4149         }
4150         break;
4151       }
4152     }
4153   } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4154              (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4155     // In ARM and Thumb-2, the compare instructions can shift their second
4156     // operand.
4157     CC = ISD::getSetCCSwappedOperands(CC);
4158     std::swap(LHS, RHS);
4159   }
4160
4161   // Thumb1 has very limited immediate modes, so turning an "and" into a
4162   // shift can save multiple instructions.
4163   //
4164   // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4165   // into "((x << n) >> n)".  But that isn't necessarily profitable on its
4166   // own. If it's the operand to an unsigned comparison with an immediate,
4167   // we can eliminate one of the shifts: we transform
4168   // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4169   //
4170   // We avoid transforming cases which aren't profitable due to encoding
4171   // details:
4172   //
4173   // 1. C2 fits into the immediate field of a cmp, and the transformed version
4174   // would not; in that case, we're essentially trading one immediate load for
4175   // another.
4176   // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4177   // 3. C2 is zero; we have other code for this special case.
4178   //
4179   // FIXME: Figure out profitability for Thumb2; we usually can't save an
4180   // instruction, since the AND is always one instruction anyway, but we could
4181   // use narrow instructions in some cases.
4182   if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4183       LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4184       LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4185       !isSignedIntSetCC(CC)) {
4186     unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue();
4187     auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4188     uint64_t RHSV = RHSC->getZExtValue();
4189     if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4190       unsigned ShiftBits = countLeadingZeros(Mask);
4191       if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4192         SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4193         LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4194         RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4195       }
4196     }
4197   }
4198
4199   // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4200   // single "lsls x, c+1".  The shift sets the "C" and "Z" flags the same
4201   // way a cmp would.
4202   // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4203   // some tweaks to the heuristics for the previous and->shift transform.
4204   // FIXME: Optimize cases where the LHS isn't a shift.
4205   if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4206       isa<ConstantSDNode>(RHS) &&
4207       cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U &&
4208       CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4209       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) {
4210     unsigned ShiftAmt =
4211       cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1;
4212     SDValue Shift = DAG.getNode(ARMISD::LSLS, dl,
4213                                 DAG.getVTList(MVT::i32, MVT::i32),
4214                                 LHS.getOperand(0),
4215                                 DAG.getConstant(ShiftAmt, dl, MVT::i32));
4216     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
4217                                      Shift.getValue(1), SDValue());
4218     ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4219     return Chain.getValue(1);
4220   }
4221
4222   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4223
4224   // If the RHS is a constant zero then the V (overflow) flag will never be
4225   // set. This can allow us to simplify GE to PL or LT to MI, which can be
4226   // simpler for other passes (like the peephole optimiser) to deal with.
4227   if (isNullConstant(RHS)) {
4228     switch (CondCode) {
4229       default: break;
4230       case ARMCC::GE:
4231         CondCode = ARMCC::PL;
4232         break;
4233       case ARMCC::LT:
4234         CondCode = ARMCC::MI;
4235         break;
4236     }
4237   }
4238
4239   ARMISD::NodeType CompareType;
4240   switch (CondCode) {
4241   default:
4242     CompareType = ARMISD::CMP;
4243     break;
4244   case ARMCC::EQ:
4245   case ARMCC::NE:
4246     // Uses only Z Flag
4247     CompareType = ARMISD::CMPZ;
4248     break;
4249   }
4250   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4251   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
4252 }
4253
4254 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4255 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4256                                      SelectionDAG &DAG, const SDLoc &dl) const {
4257   assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4258   SDValue Cmp;
4259   if (!isFloatingPointZero(RHS))
4260     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
4261   else
4262     Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
4263   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
4264 }
4265
4266 /// duplicateCmp - Glue values can have only one use, so this function
4267 /// duplicates a comparison node.
4268 SDValue
4269 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
4270   unsigned Opc = Cmp.getOpcode();
4271   SDLoc DL(Cmp);
4272   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
4273     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4274
4275   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
4276   Cmp = Cmp.getOperand(0);
4277   Opc = Cmp.getOpcode();
4278   if (Opc == ARMISD::CMPFP)
4279     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
4280   else {
4281     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
4282     Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
4283   }
4284   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
4285 }
4286
4287 // This function returns three things: the arithmetic computation itself
4288 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc).  The
4289 // comparison and the condition code define the case in which the arithmetic
4290 // computation *does not* overflow.
4291 std::pair<SDValue, SDValue>
4292 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4293                                  SDValue &ARMcc) const {
4294   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
4295
4296   SDValue Value, OverflowCmp;
4297   SDValue LHS = Op.getOperand(0);
4298   SDValue RHS = Op.getOperand(1);
4299   SDLoc dl(Op);
4300
4301   // FIXME: We are currently always generating CMPs because we don't support
4302   // generating CMN through the backend. This is not as good as the natural
4303   // CMP case because it causes a register dependency and cannot be folded
4304   // later.
4305
4306   switch (Op.getOpcode()) {
4307   default:
4308     llvm_unreachable("Unknown overflow instruction!");
4309   case ISD::SADDO:
4310     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4311     Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4312     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4313     break;
4314   case ISD::UADDO:
4315     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4316     // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4317     // We do not use it in the USUBO case as Value may not be used.
4318     Value = DAG.getNode(ARMISD::ADDC, dl,
4319                         DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4320                 .getValue(0);
4321     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
4322     break;
4323   case ISD::SSUBO:
4324     ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4325     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4326     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4327     break;
4328   case ISD::USUBO:
4329     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4330     Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4331     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
4332     break;
4333   case ISD::UMULO:
4334     // We generate a UMUL_LOHI and then check if the high word is 0.
4335     ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4336     Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4337                         DAG.getVTList(Op.getValueType(), Op.getValueType()),
4338                         LHS, RHS);
4339     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4340                               DAG.getConstant(0, dl, MVT::i32));
4341     Value = Value.getValue(0); // We only want the low 32 bits for the result.
4342     break;
4343   case ISD::SMULO:
4344     // We generate a SMUL_LOHI and then check if all the bits of the high word
4345     // are the same as the sign bit of the low word.
4346     ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4347     Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4348                         DAG.getVTList(Op.getValueType(), Op.getValueType()),
4349                         LHS, RHS);
4350     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
4351                               DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4352                                           Value.getValue(0),
4353                                           DAG.getConstant(31, dl, MVT::i32)));
4354     Value = Value.getValue(0); // We only want the low 32 bits for the result.
4355     break;
4356   } // switch (...)
4357
4358   return std::make_pair(Value, OverflowCmp);
4359 }
4360
4361 SDValue
4362 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4363   // Let legalize expand this if it isn't a legal type yet.
4364   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4365     return SDValue();
4366
4367   SDValue Value, OverflowCmp;
4368   SDValue ARMcc;
4369   std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4370   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4371   SDLoc dl(Op);
4372   // We use 0 and 1 as false and true values.
4373   SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4374   SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4375   EVT VT = Op.getValueType();
4376
4377   SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
4378                                  ARMcc, CCR, OverflowCmp);
4379
4380   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4381   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4382 }
4383
4384 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4385                                               SelectionDAG &DAG) {
4386   SDLoc DL(BoolCarry);
4387   EVT CarryVT = BoolCarry.getValueType();
4388
4389   // This converts the boolean value carry into the carry flag by doing
4390   // ARMISD::SUBC Carry, 1
4391   SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4392                               DAG.getVTList(CarryVT, MVT::i32),
4393                               BoolCarry, DAG.getConstant(1, DL, CarryVT));
4394   return Carry.getValue(1);
4395 }
4396
4397 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4398                                               SelectionDAG &DAG) {
4399   SDLoc DL(Flags);
4400
4401   // Now convert the carry flag into a boolean carry. We do this
4402   // using ARMISD:ADDE 0, 0, Carry
4403   return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4404                      DAG.getConstant(0, DL, MVT::i32),
4405                      DAG.getConstant(0, DL, MVT::i32), Flags);
4406 }
4407
4408 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4409                                              SelectionDAG &DAG) const {
4410   // Let legalize expand this if it isn't a legal type yet.
4411   if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4412     return SDValue();
4413
4414   SDValue LHS = Op.getOperand(0);
4415   SDValue RHS = Op.getOperand(1);
4416   SDLoc dl(Op);
4417
4418   EVT VT = Op.getValueType();
4419   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4420   SDValue Value;
4421   SDValue Overflow;
4422   switch (Op.getOpcode()) {
4423   default:
4424     llvm_unreachable("Unknown overflow instruction!");
4425   case ISD::UADDO:
4426     Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4427     // Convert the carry flag into a boolean value.
4428     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4429     break;
4430   case ISD::USUBO: {
4431     Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4432     // Convert the carry flag into a boolean value.
4433     Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4434     // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4435     // value. So compute 1 - C.
4436     Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4437                            DAG.getConstant(1, dl, MVT::i32), Overflow);
4438     break;
4439   }
4440   }
4441
4442   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4443 }
4444
4445 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4446   SDValue Cond = Op.getOperand(0);
4447   SDValue SelectTrue = Op.getOperand(1);
4448   SDValue SelectFalse = Op.getOperand(2);
4449   SDLoc dl(Op);
4450   unsigned Opc = Cond.getOpcode();
4451
4452   if (Cond.getResNo() == 1 &&
4453       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4454        Opc == ISD::USUBO)) {
4455     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
4456       return SDValue();
4457
4458     SDValue Value, OverflowCmp;
4459     SDValue ARMcc;
4460     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4461     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
4462     EVT VT = Op.getValueType();
4463
4464     return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
4465                    OverflowCmp, DAG);
4466   }
4467
4468   // Convert:
4469   //
4470   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4471   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4472   //
4473   if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4474     const ConstantSDNode *CMOVTrue =
4475       dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4476     const ConstantSDNode *CMOVFalse =
4477       dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4478
4479     if (CMOVTrue && CMOVFalse) {
4480       unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4481       unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4482
4483       SDValue True;
4484       SDValue False;
4485       if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4486         True = SelectTrue;
4487         False = SelectFalse;
4488       } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4489         True = SelectFalse;
4490         False = SelectTrue;
4491       }
4492
4493       if (True.getNode() && False.getNode()) {
4494         EVT VT = Op.getValueType();
4495         SDValue ARMcc = Cond.getOperand(2);
4496         SDValue CCR = Cond.getOperand(3);
4497         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
4498         assert(True.getValueType() == VT);
4499         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
4500       }
4501     }
4502   }
4503
4504   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4505   // undefined bits before doing a full-word comparison with zero.
4506   Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4507                      DAG.getConstant(1, dl, Cond.getValueType()));
4508
4509   return DAG.getSelectCC(dl, Cond,
4510                          DAG.getConstant(0, dl, Cond.getValueType()),
4511                          SelectTrue, SelectFalse, ISD::SETNE);
4512 }
4513
4514 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4515                                  bool &swpCmpOps, bool &swpVselOps) {
4516   // Start by selecting the GE condition code for opcodes that return true for
4517   // 'equality'
4518   if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4519       CC == ISD::SETULE || CC == ISD::SETGE  || CC == ISD::SETLE)
4520     CondCode = ARMCC::GE;
4521
4522   // and GT for opcodes that return false for 'equality'.
4523   else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4524            CC == ISD::SETULT || CC == ISD::SETGT  || CC == ISD::SETLT)
4525     CondCode = ARMCC::GT;
4526
4527   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4528   // to swap the compare operands.
4529   if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4530       CC == ISD::SETULT || CC == ISD::SETLE  || CC == ISD::SETLT)
4531     swpCmpOps = true;
4532
4533   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4534   // If we have an unordered opcode, we need to swap the operands to the VSEL
4535   // instruction (effectively negating the condition).
4536   //
4537   // This also has the effect of swapping which one of 'less' or 'greater'
4538   // returns true, so we also swap the compare operands. It also switches
4539   // whether we return true for 'equality', so we compensate by picking the
4540   // opposite condition code to our original choice.
4541   if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4542       CC == ISD::SETUGT) {
4543     swpCmpOps = !swpCmpOps;
4544     swpVselOps = !swpVselOps;
4545     CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4546   }
4547
4548   // 'ordered' is 'anything but unordered', so use the VS condition code and
4549   // swap the VSEL operands.
4550   if (CC == ISD::SETO) {
4551     CondCode = ARMCC::VS;
4552     swpVselOps = true;
4553   }
4554
4555   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4556   // code and swap the VSEL operands. Also do this if we don't care about the
4557   // unordered case.
4558   if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4559     CondCode = ARMCC::EQ;
4560     swpVselOps = true;
4561   }
4562 }
4563
4564 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4565                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
4566                                    SDValue Cmp, SelectionDAG &DAG) const {
4567   if (!Subtarget->hasFP64() && VT == MVT::f64) {
4568     FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4569                            DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4570     TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4571                           DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4572
4573     SDValue TrueLow = TrueVal.getValue(0);
4574     SDValue TrueHigh = TrueVal.getValue(1);
4575     SDValue FalseLow = FalseVal.getValue(0);
4576     SDValue FalseHigh = FalseVal.getValue(1);
4577
4578     SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4579                               ARMcc, CCR, Cmp);
4580     SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4581                                ARMcc, CCR, duplicateCmp(Cmp, DAG));
4582
4583     return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4584   } else {
4585     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
4586                        Cmp);
4587   }
4588 }
4589
4590 static bool isGTorGE(ISD::CondCode CC) {
4591   return CC == ISD::SETGT || CC == ISD::SETGE;
4592 }
4593
4594 static bool isLTorLE(ISD::CondCode CC) {
4595   return CC == ISD::SETLT || CC == ISD::SETLE;
4596 }
4597
4598 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4599 // All of these conditions (and their <= and >= counterparts) will do:
4600 //          x < k ? k : x
4601 //          x > k ? x : k
4602 //          k < x ? x : k
4603 //          k > x ? k : x
4604 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4605                             const SDValue TrueVal, const SDValue FalseVal,
4606                             const ISD::CondCode CC, const SDValue K) {
4607   return (isGTorGE(CC) &&
4608           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4609          (isLTorLE(CC) &&
4610           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4611 }
4612
4613 // Similar to isLowerSaturate(), but checks for upper-saturating conditions.
4614 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
4615                             const SDValue TrueVal, const SDValue FalseVal,
4616                             const ISD::CondCode CC, const SDValue K) {
4617   return (isGTorGE(CC) &&
4618           ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) ||
4619          (isLTorLE(CC) &&
4620           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
4621 }
4622
4623 // Check if two chained conditionals could be converted into SSAT or USAT.
4624 //
4625 // SSAT can replace a set of two conditional selectors that bound a number to an
4626 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
4627 //
4628 //     x < -k ? -k : (x > k ? k : x)
4629 //     x < -k ? -k : (x < k ? x : k)
4630 //     x > -k ? (x > k ? k : x) : -k
4631 //     x < k ? (x < -k ? -k : x) : k
4632 //     etc.
4633 //
4634 // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is
4635 // a power of 2.
4636 //
4637 // It returns true if the conversion can be done, false otherwise.
4638 // Additionally, the variable is returned in parameter V, the constant in K and
4639 // usat is set to true if the conditional represents an unsigned saturation
4640 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
4641                                     uint64_t &K, bool &usat) {
4642   SDValue LHS1 = Op.getOperand(0);
4643   SDValue RHS1 = Op.getOperand(1);
4644   SDValue TrueVal1 = Op.getOperand(2);
4645   SDValue FalseVal1 = Op.getOperand(3);
4646   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4647
4648   const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
4649   if (Op2.getOpcode() != ISD::SELECT_CC)
4650     return false;
4651
4652   SDValue LHS2 = Op2.getOperand(0);
4653   SDValue RHS2 = Op2.getOperand(1);
4654   SDValue TrueVal2 = Op2.getOperand(2);
4655   SDValue FalseVal2 = Op2.getOperand(3);
4656   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
4657
4658   // Find out which are the constants and which are the variables
4659   // in each conditional
4660   SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
4661                                                         ? &RHS1
4662                                                         : nullptr;
4663   SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
4664                                                         ? &RHS2
4665                                                         : nullptr;
4666   SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
4667   SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
4668   SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
4669   SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
4670
4671   // We must detect cases where the original operations worked with 16- or
4672   // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
4673   // must work with sign-extended values but the select operations return
4674   // the original non-extended value.
4675   SDValue V2TmpReg = V2Tmp;
4676   if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
4677     V2TmpReg = V2Tmp->getOperand(0);
4678
4679   // Check that the registers and the constants have the correct values
4680   // in both conditionals
4681   if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
4682       V2TmpReg != V2)
4683     return false;
4684
4685   // Figure out which conditional is saturating the lower/upper bound.
4686   const SDValue *LowerCheckOp =
4687       isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4688           ? &Op
4689           : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
4690                 ? &Op2
4691                 : nullptr;
4692   const SDValue *UpperCheckOp =
4693       isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
4694           ? &Op
4695           : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
4696                 ? &Op2
4697                 : nullptr;
4698
4699   if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
4700     return false;
4701
4702   // Check that the constant in the lower-bound check is
4703   // the opposite of the constant in the upper-bound check
4704   // in 1's complement.
4705   int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
4706   int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
4707   int64_t PosVal = std::max(Val1, Val2);
4708   int64_t NegVal = std::min(Val1, Val2);
4709
4710   if (((Val1 > Val2 && UpperCheckOp == &Op) ||
4711        (Val1 < Val2 && UpperCheckOp == &Op2)) &&
4712       isPowerOf2_64(PosVal + 1)) {
4713
4714     // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
4715     if (Val1 == ~Val2)
4716       usat = false;
4717     else if (NegVal == 0)
4718       usat = true;
4719     else
4720       return false;
4721
4722     V = V2;
4723     K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
4724
4725     return true;
4726   }
4727
4728   return false;
4729 }
4730
4731 // Check if a condition of the type x < k ? k : x can be converted into a
4732 // bit operation instead of conditional moves.
4733 // Currently this is allowed given:
4734 // - The conditions and values match up
4735 // - k is 0 or -1 (all ones)
4736 // This function will not check the last condition, thats up to the caller
4737 // It returns true if the transformation can be made, and in such case
4738 // returns x in V, and k in SatK.
4739 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
4740                                          SDValue &SatK)
4741 {
4742   SDValue LHS = Op.getOperand(0);
4743   SDValue RHS = Op.getOperand(1);
4744   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
4745   SDValue TrueVal = Op.getOperand(2);
4746   SDValue FalseVal = Op.getOperand(3);
4747
4748   SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
4749                                                ? &RHS
4750                                                : nullptr;
4751
4752   // No constant operation in comparison, early out
4753   if (!K)
4754     return false;
4755
4756   SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
4757   V = (KTmp == TrueVal) ? FalseVal : TrueVal;
4758   SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
4759
4760   // If the constant on left and right side, or variable on left and right,
4761   // does not match, early out
4762   if (*K != KTmp || V != VTmp)
4763     return false;
4764
4765   if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
4766     SatK = *K;
4767     return true;
4768   }
4769
4770   return false;
4771 }
4772
4773 bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
4774   if (VT == MVT::f32)
4775     return !Subtarget->hasVFP2Base();
4776   if (VT == MVT::f64)
4777     return !Subtarget->hasFP64();
4778   if (VT == MVT::f16)
4779     return !Subtarget->hasFullFP16();
4780   return false;
4781 }
4782
/// Custom lowering for a SELECT_CC node.
///
/// \p Op carries the compare operands in operands 0 and 1, the true and false
/// values in operands 2 and 3, and the condition code in operand 4. The
/// following strategies are tried in order:
///   1. a pattern accepted by isSaturatingConditional becomes a single
///      ARMISD::USAT or ARMISD::SSAT node;
///   2. an i32 pattern accepted by isLowerSaturatingConditional with a
///      constant of 0 or -1 becomes shift/AND/OR bit operations;
///   3. on v8.1-M Mainline, a select between two related constants (with i32
///      compare operands) becomes ARMISD::CSINV / CSNEG / CSINC;
///   4. otherwise an integer compare (getARMCmp) or a VFP compare (getVFPCmp)
///      feeds one or two conditional moves built with getCMOV.
///
/// \returns the node that replaces \p Op.
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  // (or USAT, when the matcher reports an unsigned saturation).
  SDValue SatValue;
  uint64_t SatConstant;
  bool SatUSat;
  if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
      isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
    // The immediate operand is the number of trailing one bits in the
    // saturation constant.
    if (SatUSat)
      return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
    else
      return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                         DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
  }

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // ShiftV is x >> 31 (arithmetic): all ones when x is negative, else zero.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      // k == 0: x & ~(x >> 31).
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      // k == -1: x | (x >> 31).
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
    // Any other constant falls through to the generic lowering below.
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);

  // v8.1-M Mainline: when both select arms are constants related by
  // inversion, negation or increment, only one of them has to be materialised.
  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;

    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      // Same as the previous case with the two arms exchanged, so swap them
      // and invert the condition.
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, true);
    }

    if (Opcode) {
      // If one of the constants is cheaper than another, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, true);
      }

      // Attempt to use ZR by checking whether TVal is 0, possibly inverting
      // the condition to get there. CSINC is not invertible like the other
      // two (~(~a) == a, -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
      if (TVal == 0)
        TrueVal = DAG.getRegister(ARM::ZR, MVT::i32);

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      // Note: this VT intentionally shadows the outer VT and is taken from the
      // (possibly replaced) TrueVal.
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  // Compare operands of a floating-point type the subtarget cannot handle are
  // softened first; LHS, RHS and CC are updated in place.
  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      // LT, LE, VC and NE are the inverses of the four codes listed above, so
      // invert the condition and exchange the select arms.
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, true);
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  // Floating-point compare: FPCCToARMCC may return a second condition code in
  // CondCode2 (ARMCC::AL when unused).
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    // Only apply the requested swaps when the resulting condition code is one
    // of the four named in the VSEL comment above.
    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  // When a second condition code is needed, layer a second conditional move
  // over the first result, again using TrueVal, under CondCode2.
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}
4958
4959 /// canChangeToInt - Given the fp compare operand, return true if it is suitable
4960 /// to morph to an integer compare sequence.
4961 static bool canChangeToInt(SDValue Op, bool &SeenZero,
4962                            const ARMSubtarget *Subtarget) {
4963   SDNode *N = Op.getNode();
4964   if (!N->hasOneUse())
4965     // Otherwise it requires moving the value from fp to integer registers.
4966     return false;
4967   if (!N->getNumValues())
4968     return false;
4969   EVT VT = Op.getValueType();
4970   if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
4971     // f32 case is generally profitable. f64 case only makes sense when vcmpe +
4972     // vmrs are very slow, e.g. cortex-a8.
4973     return false;
4974
4975   if (isFloatingPointZero(Op)) {
4976     SeenZero = true;
4977     return true;
4978   }
4979   return ISD::isNormalLoad(N);
4980 }
4981
4982 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
4983   if (isFloatingPointZero(Op))
4984     return DAG.getConstant(0, SDLoc(Op), MVT::i32);
4985
4986   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
4987     return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
4988                        Ld->getPointerInfo(), Ld->getAlignment(),
4989                        Ld->getMemOperand()->getFlags());
4990
4991   llvm_unreachable("Unknown VFP cmp argument!");
4992 }
4993
4994 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
4995                            SDValue &RetVal1, SDValue &RetVal2) {
4996   SDLoc dl(Op);
4997
4998   if (isFloatingPointZero(Op)) {
4999     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5000     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5001     return;
5002   }
5003
5004   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5005     SDValue Ptr = Ld->getBasePtr();
5006     RetVal1 =
5007         DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5008                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
5009
5010     EVT PtrType = Ptr.getValueType();
5011     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
5012     SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5013                                  PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5014     RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5015                           Ld->getPointerInfo().getWithOffset(4), NewAlign,
5016                           Ld->getMemOperand()->getFlags());
5017     return;
5018   }
5019
5020   llvm_unreachable("Unknown VFP cmp argument!");
5021 }
5022
5023 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
5024 /// f32 and even f64 comparisons to integer ones.
5025 SDValue
5026 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5027   SDValue Chain = Op.getOperand(0);
5028   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5029   SDValue LHS = Op.getOperand(2);
5030   SDValue RHS = Op.getOperand(3);
5031   SDValue Dest = Op.getOperand(4);
5032   SDLoc dl(Op);
5033
5034   bool LHSSeenZero = false;
5035   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5036   bool RHSSeenZero = false;
5037   bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5038   if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5039     // If unsafe fp math optimization is enabled and there are no other uses of
5040     // the CMP operands, and the condition code is EQ or NE, we can optimize it
5041     // to an integer comparison.
5042     if (CC == ISD::SETOEQ)
5043       CC = ISD::SETEQ;
5044     else if (CC == ISD::SETUNE)
5045       CC = ISD::SETNE;
5046
5047     SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5048     SDValue ARMcc;
5049     if (LHS.getValueType() == MVT::f32) {
5050       LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5051                         bitcastf32Toi32(LHS, DAG), Mask);
5052       RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5053                         bitcastf32Toi32(RHS, DAG), Mask);
5054       SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5055       SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5056       return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5057                          Chain, Dest, ARMcc, CCR, Cmp);
5058     }
5059
5060     SDValue LHS1, LHS2;
5061     SDValue RHS1, RHS2;
5062     expandf64Toi32(LHS, DAG, LHS1, LHS2);
5063     expandf64Toi32(RHS, DAG, RHS1, RHS2);
5064     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5065     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5066     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5067     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5068     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5069     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5070     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
5071   }
5072
5073   return SDValue();
5074 }
5075
5076 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5077   SDValue Chain = Op.getOperand(0);
5078   SDValue Cond = Op.getOperand(1);
5079   SDValue Dest = Op.getOperand(2);
5080   SDLoc dl(Op);
5081
5082   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5083   // instruction.
5084   unsigned Opc = Cond.getOpcode();
5085   bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5086                       !Subtarget->isThumb1Only();
5087   if (Cond.getResNo() == 1 &&
5088       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5089        Opc == ISD::USUBO || OptimizeMul)) {
5090     // Only lower legal XALUO ops.
5091     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
5092       return SDValue();
5093
5094     // The actual operation with overflow check.
5095     SDValue Value, OverflowCmp;
5096     SDValue ARMcc;
5097     std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5098
5099     // Reverse the condition code.
5100     ARMCC::CondCodes CondCode =
5101         (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5102     CondCode = ARMCC::getOppositeCondition(CondCode);
5103     ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5104     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5105
5106     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5107                        OverflowCmp);
5108   }
5109
5110   return SDValue();
5111 }
5112
5113 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5114   SDValue Chain = Op.getOperand(0);
5115   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5116   SDValue LHS = Op.getOperand(2);
5117   SDValue RHS = Op.getOperand(3);
5118   SDValue Dest = Op.getOperand(4);
5119   SDLoc dl(Op);
5120
5121   if (isUnsupportedFloatingType(LHS.getValueType())) {
5122     DAG.getTargetLoweringInfo().softenSetCCOperands(
5123         DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5124
5125     // If softenSetCCOperands only returned one value, we should compare it to
5126     // zero.
5127     if (!RHS.getNode()) {
5128       RHS = DAG.getConstant(0, dl, LHS.getValueType());
5129       CC = ISD::SETNE;
5130     }
5131   }
5132
5133   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5134   // instruction.
5135   unsigned Opc = LHS.getOpcode();
5136   bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5137                       !Subtarget->isThumb1Only();
5138   if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5139       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5140        Opc == ISD::USUBO || OptimizeMul) &&
5141       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5142     // Only lower legal XALUO ops.
5143     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
5144       return SDValue();
5145
5146     // The actual operation with overflow check.
5147     SDValue Value, OverflowCmp;
5148     SDValue ARMcc;
5149     std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5150
5151     if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5152       // Reverse the condition code.
5153       ARMCC::CondCodes CondCode =
5154           (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5155       CondCode = ARMCC::getOppositeCondition(CondCode);
5156       ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5157     }
5158     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5159
5160     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
5161                        OverflowCmp);
5162   }
5163
5164   if (LHS.getValueType() == MVT::i32) {
5165     SDValue ARMcc;
5166     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5167     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5168     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
5169                        Chain, Dest, ARMcc, CCR, Cmp);
5170   }
5171
5172   if (getTargetMachine().Options.UnsafeFPMath &&
5173       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
5174        CC == ISD::SETNE || CC == ISD::SETUNE)) {
5175     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5176       return Result;
5177   }
5178
5179   ARMCC::CondCodes CondCode, CondCode2;
5180   FPCCToARMCC(CC, CondCode, CondCode2);
5181
5182   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5183   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5184   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5185   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
5186   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
5187   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5188   if (CondCode2 != ARMCC::AL) {
5189     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5190     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
5191     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
5192   }
5193   return Res;
5194 }
5195
5196 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5197   SDValue Chain = Op.getOperand(0);
5198   SDValue Table = Op.getOperand(1);
5199   SDValue Index = Op.getOperand(2);
5200   SDLoc dl(Op);
5201
5202   EVT PTy = getPointerTy(DAG.getDataLayout());
5203   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5204   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5205   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5206   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5207   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5208   if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5209     // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5210     // which does another jump to the destination. This also makes it easier
5211     // to translate it to TBB / TBH later (Thumb2 only).
5212     // FIXME: This might not work if the function is extremely large.
5213     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5214                        Addr, Op.getOperand(2), JTI);
5215   }
5216   if (isPositionIndependent() || Subtarget->isROPI()) {
5217     Addr =
5218         DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5219                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5220     Chain = Addr.getValue(1);
5221     Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5222     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5223   } else {
5224     Addr =
5225         DAG.getLoad(PTy, dl, Chain, Addr,
5226                     MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5227     Chain = Addr.getValue(1);
5228     return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5229   }
5230 }
5231
5232 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5233   EVT VT = Op.getValueType();
5234   SDLoc dl(Op);
5235
5236   if (Op.getValueType().getVectorElementType() == MVT::i32) {
5237     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5238       return Op;
5239     return DAG.UnrollVectorOp(Op.getNode());
5240   }
5241
5242   const bool HasFullFP16 =
5243     static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5244
5245   EVT NewTy;
5246   const EVT OpTy = Op.getOperand(0).getValueType();
5247   if (OpTy == MVT::v4f32)
5248     NewTy = MVT::v4i32;
5249   else if (OpTy == MVT::v4f16 && HasFullFP16)
5250     NewTy = MVT::v4i16;
5251   else if (OpTy == MVT::v8f16 && HasFullFP16)
5252     NewTy = MVT::v8i16;
5253   else
5254     llvm_unreachable("Invalid type for custom lowering!");
5255
5256   if (VT != MVT::v4i16 && VT != MVT::v8i16)
5257     return DAG.UnrollVectorOp(Op.getNode());
5258
5259   Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5260   return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5261 }
5262
5263 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5264   EVT VT = Op.getValueType();
5265   if (VT.isVector())
5266     return LowerVectorFP_TO_INT(Op, DAG);
5267   if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) {
5268     RTLIB::Libcall LC;
5269     if (Op.getOpcode() == ISD::FP_TO_SINT)
5270       LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(),
5271                               Op.getValueType());
5272     else
5273       LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(),
5274                               Op.getValueType());
5275     MakeLibCallOptions CallOptions;
5276     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5277                        CallOptions, SDLoc(Op)).first;
5278   }
5279
5280   return Op;
5281 }
5282
5283 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5284   EVT VT = Op.getValueType();
5285   SDLoc dl(Op);
5286
5287   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5288     if (VT.getVectorElementType() == MVT::f32)
5289       return Op;
5290     return DAG.UnrollVectorOp(Op.getNode());
5291   }
5292
5293   assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5294           Op.getOperand(0).getValueType() == MVT::v8i16) &&
5295          "Invalid type for custom lowering!");
5296
5297   const bool HasFullFP16 =
5298     static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();
5299
5300   EVT DestVecType;
5301   if (VT == MVT::v4f32)
5302     DestVecType = MVT::v4i32;
5303   else if (VT == MVT::v4f16 && HasFullFP16)
5304     DestVecType = MVT::v4i16;
5305   else if (VT == MVT::v8f16 && HasFullFP16)
5306     DestVecType = MVT::v8i16;
5307   else
5308     return DAG.UnrollVectorOp(Op.getNode());
5309
5310   unsigned CastOpc;
5311   unsigned Opc;
5312   switch (Op.getOpcode()) {
5313   default: llvm_unreachable("Invalid opcode!");
5314   case ISD::SINT_TO_FP:
5315     CastOpc = ISD::SIGN_EXTEND;
5316     Opc = ISD::SINT_TO_FP;
5317     break;
5318   case ISD::UINT_TO_FP:
5319     CastOpc = ISD::ZERO_EXTEND;
5320     Opc = ISD::UINT_TO_FP;
5321     break;
5322   }
5323
5324   Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5325   return DAG.getNode(Opc, dl, VT, Op);
5326 }
5327
5328 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5329   EVT VT = Op.getValueType();
5330   if (VT.isVector())
5331     return LowerVectorINT_TO_FP(Op, DAG);
5332   if (isUnsupportedFloatingType(VT)) {
5333     RTLIB::Libcall LC;
5334     if (Op.getOpcode() == ISD::SINT_TO_FP)
5335       LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5336                               Op.getValueType());
5337     else
5338       LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5339                               Op.getValueType());
5340     MakeLibCallOptions CallOptions;
5341     return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5342                        CallOptions, SDLoc(Op)).first;
5343   }
5344
5345   return Op;
5346 }
5347
/// Custom lowering for FCOPYSIGN: the result has the magnitude of operand 0
/// and the sign of operand 1.
///
/// Two strategies are used:
///  - With NEON, and when operand 0 is not produced by a BITCAST or
///    ARMISD::VMOVDRR, both operands are moved into 64-bit vector types and
///    combined with a sign-bit mask using AND / OR.
///  - Otherwise the sign bit is extracted and merged with 32-bit integer
///    operations, using ARMISD::VMOVRRD / ARMISD::VMOVDRR to split and rebuild
///    f64 values.
///
/// The two operands may have different types (f32 or f64); the result type is
/// that of \p Op.
SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  // Tmp0 supplies the magnitude, Tmp1 supplies the sign.
  SDValue Tmp0 = Op.getOperand(0);
  SDValue Tmp1 = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT SrcVT = Tmp1.getValueType();
  // NOTE(review): a BITCAST or VMOVDRR producer presumably means the magnitude
  // currently lives in general-purpose registers - confirm.
  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
    Tmp0.getOpcode() == ARMISD::VMOVDRR;
  bool UseNEON = !InGPR && Subtarget->hasNEON();

  if (UseNEON) {
    // Use VBSL to copy the sign bit.
    // NOTE(review): createVMOVModImm(0x6, 0x80) presumably encodes 0x80000000
    // in each 32-bit lane - confirm against ARM_AM::createVMOVModImm.
    unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                               DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
    // All bitwise work is done in OpVT: v2i32 for an f32 result, v1i64
    // otherwise.
    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
    if (VT == MVT::f64)
      // For an f64 result, shift the mask left by 32 within the 64-bit lane.
      Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                         DAG.getConstant(32, dl, MVT::i32));
    else /*if (VT == MVT::f32)*/
      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
    if (SrcVT == MVT::f32) {
      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
      if (VT == MVT::f64)
        // f32 sign source, f64 result: shift the source left by 32 so that
        // its top bit lines up with the top bit of the 64-bit lane.
        Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                           DAG.getConstant(32, dl, MVT::i32));
    } else if (VT == MVT::f32)
      // f64 sign source, f32 result: shift the source right by 32 so that its
      // top bit lands at bit 31.
      Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                         DAG.getConstant(32, dl, MVT::i32));
    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);

    // Build the complement of the mask by XOR-ing it with a VMOVIMM vector
    // that, going by its name, has every bit set.
    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
                                            dl, MVT::i32);
    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));

    // Result = (Tmp1 & Mask) | (Tmp0 & ~Mask).
    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
    // Convert the vector result back to the scalar result type.
    if (VT == MVT::f32) {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                        DAG.getConstant(0, dl, MVT::i32));
    } else {
      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
    }

    return Res;
  }

  // Bitcast operand 1 to i32. For an f64 source, only the second result of
  // VMOVRRD is used.
  if (SrcVT == MVT::f64)
    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                       Tmp1).getValue(1);
  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);

  // Or in the signbit with integer operations.
  // Mask1 keeps only bit 31; Mask2 clears bit 31.
  SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
  SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
  if (VT == MVT::f32) {
    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
  }

  // f64: Or the high part with signbit and then combine two parts.
  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                     Tmp0);
  SDValue Lo = Tmp0.getValue(0);
  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
}
5429
5430 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5431   MachineFunction &MF = DAG.getMachineFunction();
5432   MachineFrameInfo &MFI = MF.getFrameInfo();
5433   MFI.setReturnAddressIsTaken(true);
5434
5435   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
5436     return SDValue();
5437
5438   EVT VT = Op.getValueType();
5439   SDLoc dl(Op);
5440   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5441   if (Depth) {
5442     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5443     SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5444     return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5445                        DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5446                        MachinePointerInfo());
5447   }
5448
5449   // Return LR, which contains the return address. Mark it an implicit live-in.
5450   unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5451   return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5452 }
5453
5454 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5455   const ARMBaseRegisterInfo &ARI =
5456     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5457   MachineFunction &MF = DAG.getMachineFunction();
5458   MachineFrameInfo &MFI = MF.getFrameInfo();
5459   MFI.setFrameAddressIsTaken(true);
5460
5461   EVT VT = Op.getValueType();
5462   SDLoc dl(Op);  // FIXME probably not meaningful
5463   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5464   Register FrameReg = ARI.getFrameRegister(MF);
5465   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5466   while (Depth--)
5467     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5468                             MachinePointerInfo());
5469   return FrameAddr;
5470 }
5471
5472 // FIXME? Maybe this could be a TableGen attribute on some registers and
5473 // this table could be generated automatically from RegInfo.
5474 Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
5475                                               const MachineFunction &MF) const {
5476   Register Reg = StringSwitch<unsigned>(RegName)
5477                        .Case("sp", ARM::SP)
5478                        .Default(0);
5479   if (Reg)
5480     return Reg;
5481   report_fatal_error(Twine("Invalid register name \""
5482                               + StringRef(RegName)  + "\"."));
5483 }
5484
5485 // Result is 64 bit value so split into two 32 bit values and return as a
5486 // pair of values.
5487 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
5488                                 SelectionDAG &DAG) {
5489   SDLoc DL(N);
5490
5491   // This function is only supposed to be called for i64 type destination.
5492   assert(N->getValueType(0) == MVT::i64
5493           && "ExpandREAD_REGISTER called for non-i64 type result.");
5494
5495   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5496                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5497                              N->getOperand(0),
5498                              N->getOperand(1));
5499
5500   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5501                     Read.getValue(1)));
5502   Results.push_back(Read.getOperand(0));
5503 }
5504
5505 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5506 /// When \p DstVT, the destination type of \p BC, is on the vector
5507 /// register bank and the source of bitcast, \p Op, operates on the same bank,
5508 /// it might be possible to combine them, such that everything stays on the
5509 /// vector register bank.
5510 /// \p return The node that would replace \p BT, if the combine
5511 /// is possible.
5512 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
5513                                                 SelectionDAG &DAG) {
5514   SDValue Op = BC->getOperand(0);
5515   EVT DstVT = BC->getValueType(0);
5516
5517   // The only vector instruction that can produce a scalar (remember,
5518   // since the bitcast was about to be turned into VMOVDRR, the source
5519   // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5520   // Moreover, we can do this combine only if there is one use.
5521   // Finally, if the destination type is not a vector, there is not
5522   // much point on forcing everything on the vector bank.
5523   if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5524       !Op.hasOneUse())
5525     return SDValue();
5526
5527   // If the index is not constant, we will introduce an additional
5528   // multiply that will stick.
5529   // Give up in that case.
5530   ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5531   if (!Index)
5532     return SDValue();
5533   unsigned DstNumElt = DstVT.getVectorNumElements();
5534
5535   // Compute the new index.
5536   const APInt &APIntIndex = Index->getAPIntValue();
5537   APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5538   NewIndex *= APIntIndex;
5539   // Check if the new constant index fits into i32.
5540   if (NewIndex.getBitWidth() > 32)
5541     return SDValue();
5542
5543   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5544   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
5545   SDLoc dl(Op);
5546   SDValue ExtractSrc = Op.getOperand(0);
5547   EVT VecVT = EVT::getVectorVT(
5548       *DAG.getContext(), DstVT.getScalarType(),
5549       ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
5550   SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
5551   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
5552                      DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
5553 }
5554
5555 /// ExpandBITCAST - If the target supports VFP, this function is called to
5556 /// expand a bit convert where either the source or destination type is i64 to
5557 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
5558 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
5559 /// vectors), since the legalizer won't know what to do with that.
5560 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
5561                              const ARMSubtarget *Subtarget) {
5562   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5563   SDLoc dl(N);
5564   SDValue Op = N->getOperand(0);
5565
5566   // This function is only supposed to be called for i64 types, either as the
5567   // source or destination of the bit convert.
5568   EVT SrcVT = Op.getValueType();
5569   EVT DstVT = N->getValueType(0);
5570   const bool HasFullFP16 = Subtarget->hasFullFP16();
5571
5572   if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
5573      // FullFP16: half values are passed in S-registers, and we don't
5574      // need any of the bitcast and moves:
5575      //
5576      // t2: f32,ch = CopyFromReg t0, Register:f32 %0
5577      //   t5: i32 = bitcast t2
5578      // t18: f16 = ARMISD::VMOVhr t5
5579      if (Op.getOpcode() != ISD::CopyFromReg ||
5580          Op.getValueType() != MVT::f32)
5581        return SDValue();
5582
5583      auto Move = N->use_begin();
5584      if (Move->getOpcode() != ARMISD::VMOVhr)
5585        return SDValue();
5586
5587      SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
5588      SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
5589      DAG.ReplaceAllUsesWith(*Move, &Copy);
5590      return Copy;
5591   }
5592
5593   if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
5594     if (!HasFullFP16)
5595       return SDValue();
5596     // SoftFP: read half-precision arguments:
5597     //
5598     // t2: i32,ch = ...
5599     //        t7: i16 = truncate t2 <~~~~ Op
5600     //      t8: f16 = bitcast t7    <~~~~ N
5601     //
5602     if (Op.getOperand(0).getValueType() == MVT::i32)
5603       return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
5604                          MVT::f16, Op.getOperand(0));
5605
5606     return SDValue();
5607   }
5608
5609   // Half-precision return values
5610   if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
5611     if (!HasFullFP16)
5612       return SDValue();
5613     //
5614     //          t11: f16 = fadd t8, t10
5615     //        t12: i16 = bitcast t11       <~~~ SDNode N
5616     //      t13: i32 = zero_extend t12
5617     //    t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
5618     //  t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
5619     //
5620     // transform this into:
5621     //
5622     //    t20: i32 = ARMISD::VMOVrh t11
5623     //  t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
5624     //
5625     auto ZeroExtend = N->use_begin();
5626     if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
5627         ZeroExtend->getValueType(0) != MVT::i32)
5628       return SDValue();
5629
5630     auto Copy = ZeroExtend->use_begin();
5631     if (Copy->getOpcode() == ISD::CopyToReg &&
5632         Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
5633       SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
5634       DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
5635       return Cvt;
5636     }
5637     return SDValue();
5638   }
5639
5640   if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
5641     return SDValue();
5642
5643   // Turn i64->f64 into VMOVDRR.
5644   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
5645     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
5646     // if we can combine the bitcast with its source.
5647     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
5648       return Val;
5649
5650     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
5651                              DAG.getConstant(0, dl, MVT::i32));
5652     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
5653                              DAG.getConstant(1, dl, MVT::i32));
5654     return DAG.getNode(ISD::BITCAST, dl, DstVT,
5655                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
5656   }
5657
5658   // Turn f64->i64 into VMOVRRD.
5659   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
5660     SDValue Cvt;
5661     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
5662         SrcVT.getVectorNumElements() > 1)
5663       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5664                         DAG.getVTList(MVT::i32, MVT::i32),
5665                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
5666     else
5667       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5668                         DAG.getVTList(MVT::i32, MVT::i32), Op);
5669     // Merge the pieces into a single i64 value.
5670     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
5671   }
5672
5673   return SDValue();
5674 }
5675
5676 /// getZeroVector - Returns a vector of specified type with all zero elements.
5677 /// Zero vectors are used to represent vector negation and in those cases
5678 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
5679 /// not support i64 elements, so sometimes the zero vectors will need to be
5680 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
5681 /// zero vector.
5682 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5683   assert(VT.isVector() && "Expected a vector type");
5684   // The canonical modified immediate encoding of a zero vector is....0!
5685   SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
5686   EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
5687   SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
5688   return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
5689 }
5690
5691 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two
5692 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
5693 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
5694                                                 SelectionDAG &DAG) const {
5695   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5696   EVT VT = Op.getValueType();
5697   unsigned VTBits = VT.getSizeInBits();
5698   SDLoc dl(Op);
5699   SDValue ShOpLo = Op.getOperand(0);
5700   SDValue ShOpHi = Op.getOperand(1);
5701   SDValue ShAmt  = Op.getOperand(2);
5702   SDValue ARMcc;
5703   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5704   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
5705
5706   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
5707
5708   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
5709                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
5710   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
5711   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
5712                                    DAG.getConstant(VTBits, dl, MVT::i32));
5713   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
5714   SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
5715   SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
5716   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5717                             ISD::SETGE, ARMcc, DAG, dl);
5718   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
5719                            ARMcc, CCR, CmpLo);
5720
5721   SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
5722   SDValue HiBigShift = Opc == ISD::SRA
5723                            ? DAG.getNode(Opc, dl, VT, ShOpHi,
5724                                          DAG.getConstant(VTBits - 1, dl, VT))
5725                            : DAG.getConstant(0, dl, VT);
5726   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5727                             ISD::SETGE, ARMcc, DAG, dl);
5728   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
5729                            ARMcc, CCR, CmpHi);
5730
5731   SDValue Ops[2] = { Lo, Hi };
5732   return DAG.getMergeValues(Ops, dl);
5733 }
5734
5735 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
5736 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
5737 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
5738                                                SelectionDAG &DAG) const {
5739   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5740   EVT VT = Op.getValueType();
5741   unsigned VTBits = VT.getSizeInBits();
5742   SDLoc dl(Op);
5743   SDValue ShOpLo = Op.getOperand(0);
5744   SDValue ShOpHi = Op.getOperand(1);
5745   SDValue ShAmt  = Op.getOperand(2);
5746   SDValue ARMcc;
5747   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
5748
5749   assert(Op.getOpcode() == ISD::SHL_PARTS);
5750   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
5751                                  DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
5752   SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
5753   SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
5754   SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
5755
5756   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
5757                                    DAG.getConstant(VTBits, dl, MVT::i32));
5758   SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
5759   SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5760                             ISD::SETGE, ARMcc, DAG, dl);
5761   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
5762                            ARMcc, CCR, CmpHi);
5763
5764   SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
5765                           ISD::SETGE, ARMcc, DAG, dl);
5766   SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5767   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
5768                            DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
5769
5770   SDValue Ops[2] = { Lo, Hi };
5771   return DAG.getMergeValues(Ops, dl);
5772 }
5773
5774 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
5775                                             SelectionDAG &DAG) const {
5776   // The rounding mode is in bits 23:22 of the FPSCR.
5777   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5778   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
5779   // so that the shift + and get folded into a bitfield extract.
5780   SDLoc dl(Op);
5781   SDValue Ops[] = { DAG.getEntryNode(),
5782                     DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };
5783
5784   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
5785   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
5786                                   DAG.getConstant(1U << 22, dl, MVT::i32));
5787   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5788                               DAG.getConstant(22, dl, MVT::i32));
5789   return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5790                      DAG.getConstant(3, dl, MVT::i32));
5791 }
5792
5793 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
5794                          const ARMSubtarget *ST) {
5795   SDLoc dl(N);
5796   EVT VT = N->getValueType(0);
5797   if (VT.isVector() && ST->hasNEON()) {
5798
5799     // Compute the least significant set bit: LSB = X & -X
5800     SDValue X = N->getOperand(0);
5801     SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
5802     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
5803
5804     EVT ElemTy = VT.getVectorElementType();
5805
5806     if (ElemTy == MVT::i8) {
5807       // Compute with: cttz(x) = ctpop(lsb - 1)
5808       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5809                                 DAG.getTargetConstant(1, dl, ElemTy));
5810       SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
5811       return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
5812     }
5813
5814     if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
5815         (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
5816       // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
5817       unsigned NumBits = ElemTy.getSizeInBits();
5818       SDValue WidthMinus1 =
5819           DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5820                       DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
5821       SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
5822       return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
5823     }
5824
5825     // Compute with: cttz(x) = ctpop(lsb - 1)
5826
5827     // Compute LSB - 1.
5828     SDValue Bits;
5829     if (ElemTy == MVT::i64) {
5830       // Load constant 0xffff'ffff'ffff'ffff to register.
5831       SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5832                                DAG.getTargetConstant(0x1eff, dl, MVT::i32));
5833       Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
5834     } else {
5835       SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
5836                                 DAG.getTargetConstant(1, dl, ElemTy));
5837       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
5838     }
5839     return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
5840   }
5841
5842   if (!ST->hasV6T2Ops())
5843     return SDValue();
5844
5845   SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
5846   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
5847 }
5848
5849 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
5850                           const ARMSubtarget *ST) {
5851   EVT VT = N->getValueType(0);
5852   SDLoc DL(N);
5853
5854   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
5855   assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
5856           VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
5857          "Unexpected type for custom ctpop lowering");
5858
5859   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5860   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
5861   SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
5862   Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
5863
5864   // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
5865   unsigned EltSize = 8;
5866   unsigned NumElts = VT.is64BitVector() ? 8 : 16;
5867   while (EltSize != VT.getScalarSizeInBits()) {
5868     SmallVector<SDValue, 8> Ops;
5869     Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
5870                                   TLI.getPointerTy(DAG.getDataLayout())));
5871     Ops.push_back(Res);
5872
5873     EltSize *= 2;
5874     NumElts /= 2;
5875     MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
5876     Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
5877   }
5878
5879   return Res;
5880 }
5881
5882 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
5883 /// operand of a vector shift operation, where all the elements of the
5884 /// build_vector must have the same constant integer value.
5885 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
5886   // Ignore bit_converts.
5887   while (Op.getOpcode() == ISD::BITCAST)
5888     Op = Op.getOperand(0);
5889   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
5890   APInt SplatBits, SplatUndef;
5891   unsigned SplatBitSize;
5892   bool HasAnyUndefs;
5893   if (!BVN ||
5894       !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
5895                             ElementBits) ||
5896       SplatBitSize > ElementBits)
5897     return false;
5898   Cnt = SplatBits.getSExtValue();
5899   return true;
5900 }
5901
5902 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
5903 /// operand of a vector shift left operation.  That value must be in the range:
5904 ///   0 <= Value < ElementBits for a left shift; or
5905 ///   0 <= Value <= ElementBits for a long left shift.
5906 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
5907   assert(VT.isVector() && "vector shift count is not a vector type");
5908   int64_t ElementBits = VT.getScalarSizeInBits();
5909   if (!getVShiftImm(Op, ElementBits, Cnt))
5910     return false;
5911   return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
5912 }
5913
5914 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
5915 /// operand of a vector shift right operation.  For a shift opcode, the value
5916 /// is positive, but for an intrinsic the value count must be negative. The
5917 /// absolute value must be in the range:
5918 ///   1 <= |Value| <= ElementBits for a right shift; or
5919 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
5920 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
5921                          int64_t &Cnt) {
5922   assert(VT.isVector() && "vector shift count is not a vector type");
5923   int64_t ElementBits = VT.getScalarSizeInBits();
5924   if (!getVShiftImm(Op, ElementBits, Cnt))
5925     return false;
5926   if (!isIntrinsic)
5927     return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
5928   if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
5929     Cnt = -Cnt;
5930     return true;
5931   }
5932   return false;
5933 }
5934
5935 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
5936                           const ARMSubtarget *ST) {
5937   EVT VT = N->getValueType(0);
5938   SDLoc dl(N);
5939   int64_t Cnt;
5940
5941   if (!VT.isVector())
5942     return SDValue();
5943
5944   // We essentially have two forms here. Shift by an immediate and shift by a
5945   // vector register (there are also shift by a gpr, but that is just handled
5946   // with a tablegen pattern). We cannot easily match shift by an immediate in
5947   // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
5948   // For shifting by a vector, we don't have VSHR, only VSHL (which can be
5949   // signed or unsigned, and a negative shift indicates a shift right).
5950   if (N->getOpcode() == ISD::SHL) {
5951     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
5952       return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
5953                          DAG.getConstant(Cnt, dl, MVT::i32));
5954     return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
5955                        N->getOperand(1));
5956   }
5957
5958   assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
5959          "unexpected vector shift opcode");
5960
5961   if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
5962     unsigned VShiftOpc =
5963         (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
5964     return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
5965                        DAG.getConstant(Cnt, dl, MVT::i32));
5966   }
5967
5968   // Other right shifts we don't have operations for (we use a shift left by a
5969   // negative number).
5970   EVT ShiftVT = N->getOperand(1).getValueType();
5971   SDValue NegatedCount = DAG.getNode(
5972       ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
5973   unsigned VShiftOpc =
5974       (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
5975   return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
5976 }
5977
5978 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
5979                                 const ARMSubtarget *ST) {
5980   EVT VT = N->getValueType(0);
5981   SDLoc dl(N);
5982
5983   // We can get here for a node like i32 = ISD::SHL i32, i64
5984   if (VT != MVT::i64)
5985     return SDValue();
5986
5987   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
5988           N->getOpcode() == ISD::SHL) &&
5989          "Unknown shift to lower!");
5990
5991   unsigned ShOpc = N->getOpcode();
5992   if (ST->hasMVEIntegerOps()) {
5993     SDValue ShAmt = N->getOperand(1);
5994     unsigned ShPartsOpc = ARMISD::LSLL;
5995     ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
5996
5997     // If the shift amount is greater than 32 or has a greater bitwidth than 64
5998     // then do the default optimisation
5999     if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
6000         (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32)))
6001       return SDValue();
6002
6003     // Extract the lower 32 bits of the shift amount if it's not an i32
6004     if (ShAmt->getValueType(0) != MVT::i32)
6005       ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6006
6007     if (ShOpc == ISD::SRL) {
6008       if (!Con)
6009         // There is no t2LSRLr instruction so negate and perform an lsll if the
6010         // shift amount is in a register, emulating a right shift.
6011         ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6012                             DAG.getConstant(0, dl, MVT::i32), ShAmt);
6013       else
6014         // Else generate an lsrl on the immediate shift amount
6015         ShPartsOpc = ARMISD::LSRL;
6016     } else if (ShOpc == ISD::SRA)
6017       ShPartsOpc = ARMISD::ASRL;
6018
6019     // Lower 32 bits of the destination/source
6020     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6021                              DAG.getConstant(0, dl, MVT::i32));
6022     // Upper 32 bits of the destination/source
6023     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6024                              DAG.getConstant(1, dl, MVT::i32));
6025
6026     // Generate the shift operation as computed above
6027     Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6028                      ShAmt);
6029     // The upper 32 bits come from the second return value of lsll
6030     Hi = SDValue(Lo.getNode(), 1);
6031     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6032   }
6033
6034   // We only lower SRA, SRL of 1 here, all others use generic lowering.
6035   if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6036     return SDValue();
6037
6038   // If we are in thumb mode, we don't have RRX.
6039   if (ST->isThumb1Only())
6040     return SDValue();
6041
6042   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
6043   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6044                            DAG.getConstant(0, dl, MVT::i32));
6045   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
6046                            DAG.getConstant(1, dl, MVT::i32));
6047
6048   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
6049   // captures the result into a carry flag.
6050   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
6051   Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
6052
6053   // The low part is an ARMISD::RRX operand, which shifts the carry in.
6054   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6055
6056   // Merge the pieces into a single i64 value.
6057  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6058 }
6059
6060 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6061                            const ARMSubtarget *ST) {
6062   bool Invert = false;
6063   bool Swap = false;
6064   unsigned Opc = ARMCC::AL;
6065
6066   SDValue Op0 = Op.getOperand(0);
6067   SDValue Op1 = Op.getOperand(1);
6068   SDValue CC = Op.getOperand(2);
6069   EVT VT = Op.getValueType();
6070   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6071   SDLoc dl(Op);
6072
6073   EVT CmpVT;
6074   if (ST->hasNEON())
6075     CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
6076   else {
6077     assert(ST->hasMVEIntegerOps() &&
6078            "No hardware support for integer vector comparison!");
6079
6080     if (Op.getValueType().getVectorElementType() != MVT::i1)
6081       return SDValue();
6082
6083     // Make sure we expand floating point setcc to scalar if we do not have
6084     // mve.fp, so that we can handle them from there.
6085     if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6086       return SDValue();
6087
6088     CmpVT = VT;
6089   }
6090
6091   if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6092       (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6093     // Special-case integer 64-bit equality comparisons. They aren't legal,
6094     // but they can be lowered with a few vector instructions.
6095     unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6096     EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6097     SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6098     SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6099     SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6100                               DAG.getCondCode(ISD::SETEQ));
6101     SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6102     SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6103     Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6104     if (SetCCOpcode == ISD::SETNE)
6105       Merged = DAG.getNOT(dl, Merged, CmpVT);
6106     Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6107     return Merged;
6108   }
6109
6110   if (CmpVT.getVectorElementType() == MVT::i64)
6111     // 64-bit comparisons are not legal in general.
6112     return SDValue();
6113
6114   if (Op1.getValueType().isFloatingPoint()) {
6115     switch (SetCCOpcode) {
6116     default: llvm_unreachable("Illegal FP comparison");
6117     case ISD::SETUNE:
6118     case ISD::SETNE:
6119       if (ST->hasMVEFloatOps()) {
6120         Opc = ARMCC::NE; break;
6121       } else {
6122         Invert = true; LLVM_FALLTHROUGH;
6123       }
6124     case ISD::SETOEQ:
6125     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
6126     case ISD::SETOLT:
6127     case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
6128     case ISD::SETOGT:
6129     case ISD::SETGT:  Opc = ARMCC::GT; break;
6130     case ISD::SETOLE:
6131     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
6132     case ISD::SETOGE:
6133     case ISD::SETGE: Opc = ARMCC::GE; break;
6134     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
6135     case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6136     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
6137     case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6138     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
6139     case ISD::SETONE: {
6140       // Expand this to (OLT | OGT).
6141       SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6142                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6143       SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6144                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6145       SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6146       if (Invert)
6147         Result = DAG.getNOT(dl, Result, VT);
6148       return Result;
6149     }
6150     case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
6151     case ISD::SETO: {
6152       // Expand this to (OLT | OGE).
6153       SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6154                                    DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6155       SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6156                                    DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6157       SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6158       if (Invert)
6159         Result = DAG.getNOT(dl, Result, VT);
6160       return Result;
6161     }
6162     }
6163   } else {
6164     // Integer comparisons.
6165     switch (SetCCOpcode) {
6166     default: llvm_unreachable("Illegal integer comparison");
6167     case ISD::SETNE:
6168       if (ST->hasMVEIntegerOps()) {
6169         Opc = ARMCC::NE; break;
6170       } else {
6171         Invert = true; LLVM_FALLTHROUGH;
6172       }
6173     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
6174     case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
6175     case ISD::SETGT:  Opc = ARMCC::GT; break;
6176     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
6177     case ISD::SETGE:  Opc = ARMCC::GE; break;
6178     case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
6179     case ISD::SETUGT: Opc = ARMCC::HI; break;
6180     case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
6181     case ISD::SETUGE: Opc = ARMCC::HS; break;
6182     }
6183
6184     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6185     if (ST->hasNEON() && Opc == ARMCC::EQ) {
6186       SDValue AndOp;
6187       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6188         AndOp = Op0;
6189       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6190         AndOp = Op1;
6191
6192       // Ignore bitconvert.
6193       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6194         AndOp = AndOp.getOperand(0);
6195
6196       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6197         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6198         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6199         SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6200         if (!Invert)
6201           Result = DAG.getNOT(dl, Result, VT);
6202         return Result;
6203       }
6204     }
6205   }
6206
6207   if (Swap)
6208     std::swap(Op0, Op1);
6209
6210   // If one of the operands is a constant vector zero, attempt to fold the
6211   // comparison to a specialized compare-against-zero form.
6212   SDValue SingleOp;
6213   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6214     SingleOp = Op0;
6215   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
6216     if (Opc == ARMCC::GE)
6217       Opc = ARMCC::LE;
6218     else if (Opc == ARMCC::GT)
6219       Opc = ARMCC::LT;
6220     SingleOp = Op1;
6221   }
6222
6223   SDValue Result;
6224   if (SingleOp.getNode()) {
6225     Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
6226                          DAG.getConstant(Opc, dl, MVT::i32));
6227   } else {
6228     Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6229                          DAG.getConstant(Opc, dl, MVT::i32));
6230   }
6231
6232   Result = DAG.getSExtOrTrunc(Result, dl, VT);
6233
6234   if (Invert)
6235     Result = DAG.getNOT(dl, Result, VT);
6236
6237   return Result;
6238 }
6239
6240 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6241   SDValue LHS = Op.getOperand(0);
6242   SDValue RHS = Op.getOperand(1);
6243   SDValue Carry = Op.getOperand(2);
6244   SDValue Cond = Op.getOperand(3);
6245   SDLoc DL(Op);
6246
6247   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6248
6249   // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
6250   // have to invert the carry first.
6251   Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6252                       DAG.getConstant(1, DL, MVT::i32), Carry);
6253   // This converts the boolean value carry into the carry flag.
6254   Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6255
6256   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6257   SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6258
6259   SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6260   SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6261   SDValue ARMcc = DAG.getConstant(
6262       IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6263   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
6264   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
6265                                    Cmp.getValue(1), SDValue());
6266   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6267                      CCR, Chain.getValue(1));
6268 }
6269
6270 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6271 /// valid vector constant for a NEON or MVE instruction with a "modified
6272 /// immediate" operand (e.g., VMOV).  If so, return the encoded value.
6273 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6274                                  unsigned SplatBitSize, SelectionDAG &DAG,
6275                                  const SDLoc &dl, EVT &VT, bool is128Bits,
6276                                  VMOVModImmType type) {
6277   unsigned OpCmode, Imm;
6278
6279   // SplatBitSize is set to the smallest size that splats the vector, so a
6280   // zero vector will always have SplatBitSize == 8.  However, NEON modified
6281   // immediate instructions others than VMOV do not support the 8-bit encoding
6282   // of a zero vector, and the default encoding of zero is supposed to be the
6283   // 32-bit version.
6284   if (SplatBits == 0)
6285     SplatBitSize = 32;
6286
6287   switch (SplatBitSize) {
6288   case 8:
6289     if (type != VMOVModImm)
6290       return SDValue();
6291     // Any 1-byte value is OK.  Op=0, Cmode=1110.
6292     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6293     OpCmode = 0xe;
6294     Imm = SplatBits;
6295     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6296     break;
6297
6298   case 16:
6299     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6300     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6301     if ((SplatBits & ~0xff) == 0) {
6302       // Value = 0x00nn: Op=x, Cmode=100x.
6303       OpCmode = 0x8;
6304       Imm = SplatBits;
6305       break;
6306     }
6307     if ((SplatBits & ~0xff00) == 0) {
6308       // Value = 0xnn00: Op=x, Cmode=101x.
6309       OpCmode = 0xa;
6310       Imm = SplatBits >> 8;
6311       break;
6312     }
6313     return SDValue();
6314
6315   case 32:
6316     // NEON's 32-bit VMOV supports splat values where:
6317     // * only one byte is nonzero, or
6318     // * the least significant byte is 0xff and the second byte is nonzero, or
6319     // * the least significant 2 bytes are 0xff and the third is nonzero.
6320     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6321     if ((SplatBits & ~0xff) == 0) {
6322       // Value = 0x000000nn: Op=x, Cmode=000x.
6323       OpCmode = 0;
6324       Imm = SplatBits;
6325       break;
6326     }
6327     if ((SplatBits & ~0xff00) == 0) {
6328       // Value = 0x0000nn00: Op=x, Cmode=001x.
6329       OpCmode = 0x2;
6330       Imm = SplatBits >> 8;
6331       break;
6332     }
6333     if ((SplatBits & ~0xff0000) == 0) {
6334       // Value = 0x00nn0000: Op=x, Cmode=010x.
6335       OpCmode = 0x4;
6336       Imm = SplatBits >> 16;
6337       break;
6338     }
6339     if ((SplatBits & ~0xff000000) == 0) {
6340       // Value = 0xnn000000: Op=x, Cmode=011x.
6341       OpCmode = 0x6;
6342       Imm = SplatBits >> 24;
6343       break;
6344     }
6345
6346     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6347     if (type == OtherModImm) return SDValue();
6348
6349     if ((SplatBits & ~0xffff) == 0 &&
6350         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6351       // Value = 0x0000nnff: Op=x, Cmode=1100.
6352       OpCmode = 0xc;
6353       Imm = SplatBits >> 8;
6354       break;
6355     }
6356
6357     // cmode == 0b1101 is not supported for MVE VMVN
6358     if (type == MVEVMVNModImm)
6359       return SDValue();
6360
6361     if ((SplatBits & ~0xffffff) == 0 &&
6362         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6363       // Value = 0x00nnffff: Op=x, Cmode=1101.
6364       OpCmode = 0xd;
6365       Imm = SplatBits >> 16;
6366       break;
6367     }
6368
6369     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6370     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6371     // VMOV.I32.  A (very) minor optimization would be to replicate the value
6372     // and fall through here to test for a valid 64-bit splat.  But, then the
6373     // caller would also need to check and handle the change in size.
6374     return SDValue();
6375
6376   case 64: {
6377     if (type != VMOVModImm)
6378       return SDValue();
6379     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6380     uint64_t BitMask = 0xff;
6381     uint64_t Val = 0;
6382     unsigned ImmMask = 1;
6383     Imm = 0;
6384     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6385       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6386         Val |= BitMask;
6387         Imm |= ImmMask;
6388       } else if ((SplatBits & BitMask) != 0) {
6389         return SDValue();
6390       }
6391       BitMask <<= 8;
6392       ImmMask <<= 1;
6393     }
6394
6395     if (DAG.getDataLayout().isBigEndian())
6396       // swap higher and lower 32 bit word
6397       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
6398
6399     // Op=1, Cmode=1110.
6400     OpCmode = 0x1e;
6401     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6402     break;
6403   }
6404
6405   default:
6406     llvm_unreachable("unexpected size for isVMOVModifiedImm");
6407   }
6408
6409   unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6410   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6411 }
6412
6413 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6414                                            const ARMSubtarget *ST) const {
6415   EVT VT = Op.getValueType();
6416   bool IsDouble = (VT == MVT::f64);
6417   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6418   const APFloat &FPVal = CFP->getValueAPF();
6419
6420   // Prevent floating-point constants from using literal loads
6421   // when execute-only is enabled.
6422   if (ST->genExecuteOnly()) {
6423     // If we can represent the constant as an immediate, don't lower it
6424     if (isFPImmLegal(FPVal, VT))
6425       return Op;
6426     // Otherwise, construct as integer, and move to float register
6427     APInt INTVal = FPVal.bitcastToAPInt();
6428     SDLoc DL(CFP);
6429     switch (VT.getSimpleVT().SimpleTy) {
6430       default:
6431         llvm_unreachable("Unknown floating point type!");
6432         break;
6433       case MVT::f64: {
6434         SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6435         SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6436         if (!ST->isLittle())
6437           std::swap(Lo, Hi);
6438         return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6439       }
6440       case MVT::f32:
6441           return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6442               DAG.getConstant(INTVal, DL, MVT::i32));
6443     }
6444   }
6445
6446   if (!ST->hasVFP3Base())
6447     return SDValue();
6448
6449   // Use the default (constant pool) lowering for double constants when we have
6450   // an SP-only FPU
6451   if (IsDouble && !Subtarget->hasFP64())
6452     return SDValue();
6453
6454   // Try splatting with a VMOV.f32...
6455   int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6456
6457   if (ImmVal != -1) {
6458     if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6459       // We have code in place to select a valid ConstantFP already, no need to
6460       // do any mangling.
6461       return Op;
6462     }
6463
6464     // It's a float and we are trying to use NEON operations where
6465     // possible. Lower it to a splat followed by an extract.
6466     SDLoc DL(Op);
6467     SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6468     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6469                                       NewVal);
6470     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6471                        DAG.getConstant(0, DL, MVT::i32));
6472   }
6473
6474   // The rest of our options are NEON only, make sure that's allowed before
6475   // proceeding..
6476   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6477     return SDValue();
6478
6479   EVT VMovVT;
6480   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6481
6482   // It wouldn't really be worth bothering for doubles except for one very
6483   // important value, which does happen to match: 0.0. So make sure we don't do
6484   // anything stupid.
6485   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6486     return SDValue();
6487
6488   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6489   SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6490                                      VMovVT, false, VMOVModImm);
6491   if (NewVal != SDValue()) {
6492     SDLoc DL(Op);
6493     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6494                                       NewVal);
6495     if (IsDouble)
6496       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6497
6498     // It's a float: cast and extract a vector element.
6499     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6500                                        VecConstant);
6501     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6502                        DAG.getConstant(0, DL, MVT::i32));
6503   }
6504
6505   // Finally, try a VMVN.i32
6506   NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6507                              false, VMVNModImm);
6508   if (NewVal != SDValue()) {
6509     SDLoc DL(Op);
6510     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6511
6512     if (IsDouble)
6513       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6514
6515     // It's a float: cast and extract a vector element.
6516     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6517                                        VecConstant);
6518     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6519                        DAG.getConstant(0, DL, MVT::i32));
6520   }
6521
6522   return SDValue();
6523 }
6524
6525 // check if an VEXT instruction can handle the shuffle mask when the
6526 // vector sources of the shuffle are the same.
6527 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6528   unsigned NumElts = VT.getVectorNumElements();
6529
6530   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6531   if (M[0] < 0)
6532     return false;
6533
6534   Imm = M[0];
6535
6536   // If this is a VEXT shuffle, the immediate value is the index of the first
6537   // element.  The other shuffle indices must be the successive elements after
6538   // the first one.
6539   unsigned ExpectedElt = Imm;
6540   for (unsigned i = 1; i < NumElts; ++i) {
6541     // Increment the expected index.  If it wraps around, just follow it
6542     // back to index zero and keep going.
6543     ++ExpectedElt;
6544     if (ExpectedElt == NumElts)
6545       ExpectedElt = 0;
6546
6547     if (M[i] < 0) continue; // ignore UNDEF indices
6548     if (ExpectedElt != static_cast<unsigned>(M[i]))
6549       return false;
6550   }
6551
6552   return true;
6553 }
6554
6555 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6556                        bool &ReverseVEXT, unsigned &Imm) {
6557   unsigned NumElts = VT.getVectorNumElements();
6558   ReverseVEXT = false;
6559
6560   // Assume that the first shuffle index is not UNDEF.  Fail if it is.
6561   if (M[0] < 0)
6562     return false;
6563
6564   Imm = M[0];
6565
6566   // If this is a VEXT shuffle, the immediate value is the index of the first
6567   // element.  The other shuffle indices must be the successive elements after
6568   // the first one.
6569   unsigned ExpectedElt = Imm;
6570   for (unsigned i = 1; i < NumElts; ++i) {
6571     // Increment the expected index.  If it wraps around, it may still be
6572     // a VEXT but the source vectors must be swapped.
6573     ExpectedElt += 1;
6574     if (ExpectedElt == NumElts * 2) {
6575       ExpectedElt = 0;
6576       ReverseVEXT = true;
6577     }
6578
6579     if (M[i] < 0) continue; // ignore UNDEF indices
6580     if (ExpectedElt != static_cast<unsigned>(M[i]))
6581       return false;
6582   }
6583
6584   // Adjust the index value if the source operands will be swapped.
6585   if (ReverseVEXT)
6586     Imm -= NumElts;
6587
6588   return true;
6589 }
6590
6591 /// isVREVMask - Check if a vector shuffle corresponds to a VREV
6592 /// instruction with the specified blocksize.  (The order of the elements
6593 /// within each block of the vector is reversed.)
6594 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
6595   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
6596          "Only possible block sizes for VREV are: 16, 32, 64");
6597
6598   unsigned EltSz = VT.getScalarSizeInBits();
6599   if (EltSz == 64)
6600     return false;
6601
6602   unsigned NumElts = VT.getVectorNumElements();
6603   unsigned BlockElts = M[0] + 1;
6604   // If the first shuffle index is UNDEF, be optimistic.
6605   if (M[0] < 0)
6606     BlockElts = BlockSize / EltSz;
6607
6608   if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
6609     return false;
6610
6611   for (unsigned i = 0; i < NumElts; ++i) {
6612     if (M[i] < 0) continue; // ignore UNDEF indices
6613     if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
6614       return false;
6615   }
6616
6617   return true;
6618 }
6619
6620 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
6621   // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
6622   // range, then 0 is placed into the resulting vector. So pretty much any mask
6623   // of 8 elements can work here.
6624   return VT == MVT::v8i8 && M.size() == 8;
6625 }
6626
6627 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
6628                                unsigned Index) {
6629   if (Mask.size() == Elements * 2)
6630     return Index / Elements;
6631   return Mask[Index] == 0 ? 0 : 1;
6632 }
6633
6634 // Checks whether the shuffle mask represents a vector transpose (VTRN) by
6635 // checking that pairs of elements in the shuffle mask represent the same index
6636 // in each vector, incrementing the expected index by 2 at each step.
6637 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
6638 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
6639 //  v2={e,f,g,h}
6640 // WhichResult gives the offset for each element in the mask based on which
6641 // of the two results it belongs to.
6642 //
6643 // The transpose can be represented either as:
6644 // result1 = shufflevector v1, v2, result1_shuffle_mask
6645 // result2 = shufflevector v1, v2, result2_shuffle_mask
6646 // where v1/v2 and the shuffle masks have the same number of elements
6647 // (here WhichResult (see below) indicates which result is being checked)
6648 //
6649 // or as:
6650 // results = shufflevector v1, v2, shuffle_mask
6651 // where both results are returned in one vector and the shuffle mask has twice
6652 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
6653 // want to check the low half and high half of the shuffle mask as if it were
6654 // the other case
6655 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6656   unsigned EltSz = VT.getScalarSizeInBits();
6657   if (EltSz == 64)
6658     return false;
6659
6660   unsigned NumElts = VT.getVectorNumElements();
6661   if (M.size() != NumElts && M.size() != NumElts*2)
6662     return false;
6663
6664   // If the mask is twice as long as the input vector then we need to check the
6665   // upper and lower parts of the mask with a matching value for WhichResult
6666   // FIXME: A mask with only even values will be rejected in case the first
6667   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
6668   // M[0] is used to determine WhichResult
6669   for (unsigned i = 0; i < M.size(); i += NumElts) {
6670     WhichResult = SelectPairHalf(NumElts, M, i);
6671     for (unsigned j = 0; j < NumElts; j += 2) {
6672       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
6673           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
6674         return false;
6675     }
6676   }
6677
6678   if (M.size() == NumElts*2)
6679     WhichResult = 0;
6680
6681   return true;
6682 }
6683
6684 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
6685 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6686 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
6687 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6688   unsigned EltSz = VT.getScalarSizeInBits();
6689   if (EltSz == 64)
6690     return false;
6691
6692   unsigned NumElts = VT.getVectorNumElements();
6693   if (M.size() != NumElts && M.size() != NumElts*2)
6694     return false;
6695
6696   for (unsigned i = 0; i < M.size(); i += NumElts) {
6697     WhichResult = SelectPairHalf(NumElts, M, i);
6698     for (unsigned j = 0; j < NumElts; j += 2) {
6699       if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
6700           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
6701         return false;
6702     }
6703   }
6704
6705   if (M.size() == NumElts*2)
6706     WhichResult = 0;
6707
6708   return true;
6709 }
6710
6711 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
6712 // that the mask elements are either all even and in steps of size 2 or all odd
6713 // and in steps of size 2.
6714 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
6715 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
6716 //  v2={e,f,g,h}
6717 // Requires similar checks to that of isVTRNMask with
6718 // respect the how results are returned.
6719 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6720   unsigned EltSz = VT.getScalarSizeInBits();
6721   if (EltSz == 64)
6722     return false;
6723
6724   unsigned NumElts = VT.getVectorNumElements();
6725   if (M.size() != NumElts && M.size() != NumElts*2)
6726     return false;
6727
6728   for (unsigned i = 0; i < M.size(); i += NumElts) {
6729     WhichResult = SelectPairHalf(NumElts, M, i);
6730     for (unsigned j = 0; j < NumElts; ++j) {
6731       if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
6732         return false;
6733     }
6734   }
6735
6736   if (M.size() == NumElts*2)
6737     WhichResult = 0;
6738
6739   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6740   if (VT.is64BitVector() && EltSz == 32)
6741     return false;
6742
6743   return true;
6744 }
6745
6746 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
6747 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6748 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
6749 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6750   unsigned EltSz = VT.getScalarSizeInBits();
6751   if (EltSz == 64)
6752     return false;
6753
6754   unsigned NumElts = VT.getVectorNumElements();
6755   if (M.size() != NumElts && M.size() != NumElts*2)
6756     return false;
6757
6758   unsigned Half = NumElts / 2;
6759   for (unsigned i = 0; i < M.size(); i += NumElts) {
6760     WhichResult = SelectPairHalf(NumElts, M, i);
6761     for (unsigned j = 0; j < NumElts; j += Half) {
6762       unsigned Idx = WhichResult;
6763       for (unsigned k = 0; k < Half; ++k) {
6764         int MIdx = M[i + j + k];
6765         if (MIdx >= 0 && (unsigned) MIdx != Idx)
6766           return false;
6767         Idx += 2;
6768       }
6769     }
6770   }
6771
6772   if (M.size() == NumElts*2)
6773     WhichResult = 0;
6774
6775   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6776   if (VT.is64BitVector() && EltSz == 32)
6777     return false;
6778
6779   return true;
6780 }
6781
6782 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking
6783 // that pairs of elements of the shufflemask represent the same index in each
6784 // vector incrementing sequentially through the vectors.
6785 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
6786 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
6787 //  v2={e,f,g,h}
6788 // Requires similar checks to that of isVTRNMask with respect the how results
6789 // are returned.
6790 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
6791   unsigned EltSz = VT.getScalarSizeInBits();
6792   if (EltSz == 64)
6793     return false;
6794
6795   unsigned NumElts = VT.getVectorNumElements();
6796   if (M.size() != NumElts && M.size() != NumElts*2)
6797     return false;
6798
6799   for (unsigned i = 0; i < M.size(); i += NumElts) {
6800     WhichResult = SelectPairHalf(NumElts, M, i);
6801     unsigned Idx = WhichResult * NumElts / 2;
6802     for (unsigned j = 0; j < NumElts; j += 2) {
6803       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
6804           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
6805         return false;
6806       Idx += 1;
6807     }
6808   }
6809
6810   if (M.size() == NumElts*2)
6811     WhichResult = 0;
6812
6813   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6814   if (VT.is64BitVector() && EltSz == 32)
6815     return false;
6816
6817   return true;
6818 }
6819
6820 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
6821 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
6822 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
6823 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
6824   unsigned EltSz = VT.getScalarSizeInBits();
6825   if (EltSz == 64)
6826     return false;
6827
6828   unsigned NumElts = VT.getVectorNumElements();
6829   if (M.size() != NumElts && M.size() != NumElts*2)
6830     return false;
6831
6832   for (unsigned i = 0; i < M.size(); i += NumElts) {
6833     WhichResult = SelectPairHalf(NumElts, M, i);
6834     unsigned Idx = WhichResult * NumElts / 2;
6835     for (unsigned j = 0; j < NumElts; j += 2) {
6836       if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
6837           (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
6838         return false;
6839       Idx += 1;
6840     }
6841   }
6842
6843   if (M.size() == NumElts*2)
6844     WhichResult = 0;
6845
6846   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
6847   if (VT.is64BitVector() && EltSz == 32)
6848     return false;
6849
6850   return true;
6851 }
6852
6853 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
6854 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
6855 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
6856                                            unsigned &WhichResult,
6857                                            bool &isV_UNDEF) {
6858   isV_UNDEF = false;
6859   if (isVTRNMask(ShuffleMask, VT, WhichResult))
6860     return ARMISD::VTRN;
6861   if (isVUZPMask(ShuffleMask, VT, WhichResult))
6862     return ARMISD::VUZP;
6863   if (isVZIPMask(ShuffleMask, VT, WhichResult))
6864     return ARMISD::VZIP;
6865
6866   isV_UNDEF = true;
6867   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
6868     return ARMISD::VTRN;
6869   if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
6870     return ARMISD::VUZP;
6871   if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
6872     return ARMISD::VZIP;
6873
6874   return 0;
6875 }
6876
6877 /// \return true if this is a reverse operation on an vector.
6878 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
6879   unsigned NumElts = VT.getVectorNumElements();
6880   // Make sure the mask has the right size.
6881   if (NumElts != M.size())
6882       return false;
6883
6884   // Look for <15, ..., 3, -1, 1, 0>.
6885   for (unsigned i = 0; i != NumElts; ++i)
6886     if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
6887       return false;
6888
6889   return true;
6890 }
6891
6892 // If N is an integer constant that can be moved into a register in one
6893 // instruction, return an SDValue of such a constant (will become a MOV
6894 // instruction).  Otherwise return null.
6895 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
6896                                      const ARMSubtarget *ST, const SDLoc &dl) {
6897   uint64_t Val;
6898   if (!isa<ConstantSDNode>(N))
6899     return SDValue();
6900   Val = cast<ConstantSDNode>(N)->getZExtValue();
6901
6902   if (ST->isThumb1Only()) {
6903     if (Val <= 255 || ~Val <= 255)
6904       return DAG.getConstant(Val, dl, MVT::i32);
6905   } else {
6906     if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
6907       return DAG.getConstant(Val, dl, MVT::i32);
6908   }
6909   return SDValue();
6910 }
6911
6912 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
6913                                     const ARMSubtarget *ST) {
6914   SDLoc dl(Op);
6915   EVT VT = Op.getValueType();
6916
6917   assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
6918
6919   unsigned NumElts = VT.getVectorNumElements();
6920   unsigned BoolMask;
6921   unsigned BitsPerBool;
6922   if (NumElts == 4) {
6923     BitsPerBool = 4;
6924     BoolMask = 0xf;
6925   } else if (NumElts == 8) {
6926     BitsPerBool = 2;
6927     BoolMask = 0x3;
6928   } else if (NumElts == 16) {
6929     BitsPerBool = 1;
6930     BoolMask = 0x1;
6931   } else
6932     return SDValue();
6933
6934   // If this is a single value copied into all lanes (a splat), we can just sign
6935   // extend that single value
6936   SDValue FirstOp = Op.getOperand(0);
6937   if (!isa<ConstantSDNode>(FirstOp) &&
6938       std::all_of(std::next(Op->op_begin()), Op->op_end(),
6939                   [&FirstOp](SDUse &U) {
6940                     return U.get().isUndef() || U.get() == FirstOp;
6941                   })) {
6942     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
6943                               DAG.getValueType(MVT::i1));
6944     return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
6945   }
6946
6947   // First create base with bits set where known
6948   unsigned Bits32 = 0;
6949   for (unsigned i = 0; i < NumElts; ++i) {
6950     SDValue V = Op.getOperand(i);
6951     if (!isa<ConstantSDNode>(V) && !V.isUndef())
6952       continue;
6953     bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue();
6954     if (BitSet)
6955       Bits32 |= BoolMask << (i * BitsPerBool);
6956   }
6957
6958   // Add in unknown nodes
6959   SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
6960                              DAG.getConstant(Bits32, dl, MVT::i32));
6961   for (unsigned i = 0; i < NumElts; ++i) {
6962     SDValue V = Op.getOperand(i);
6963     if (isa<ConstantSDNode>(V) || V.isUndef())
6964       continue;
6965     Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
6966                        DAG.getConstant(i, dl, MVT::i32));
6967   }
6968
6969   return Base;
6970 }
6971
/// Custom lowering for an ISD::BUILD_VECTOR node.
///
/// The strategies below are attempted in the order listed; the first one that
/// produces a node wins:
///   1. With MVE integer ops, vectors of i1 are handed to
///      LowerBUILD_VECTOR_i1.
///   2. Constant splats are materialised with an immediate VMOV, VMVN or
///      (for v2f32/v4f32) VMOV.f32 when the splat value is encodable.
///   3. An all-undef vector becomes UNDEF; a vector whose only non-undef
///      element is lane 0 becomes SCALAR_TO_VECTOR (unless it is a normal load).
///   4. If one value fills more than half of the lanes and elements are at most
///      32 bits wide: non-constant vectors use VDUP/VDUPLANE plus
///      INSERT_VECTOR_ELT for the lanes that differ; constant floating-point
///      vectors are retried as integer vectors; constant single-value vectors
///      may use VDUP of a cheap constant.
///   5. Remaining all-constant vectors return null (default expansion).
///   6. Vectors with at least four elements are offered to ReconstructShuffle.
///   7. On NEON, 128-bit vectors other than v2f64/v4f32 are split into two
///      half-width BUILD_VECTORs and concatenated.
///   8. Vectors with elements of 32 bits or more use ARMISD::BUILD_VECTOR.
///   9. Otherwise a chain of INSERT_VECTOR_ELT is emitted when more than one
///      distinct value is present.
///
/// \param Op  the BUILD_VECTOR node to lower.
/// \param DAG the DAG in which replacement nodes are created.
/// \param ST  subtarget queried for NEON / MVE availability.
/// \returns the lowered value, or a null SDValue to request default expansion.
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                             const ARMSubtarget *ST) const {
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  // MVE predicate vectors have their own lowering.
  if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
    return LowerBUILD_VECTOR_i1(Op, DAG, ST);

  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    // Every bit of the splat is undef, so the whole vector is.
    if (SplatUndef.isAllOnesValue())
      return DAG.getUNDEF(VT);

    // The size limits also keep the getZExtValue() calls below within 64 bits.
    if ((ST->hasNEON() && SplatBitSize <= 64) ||
        (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
      // Check if an immediate VMOV works.
      EVT VmovVT;
      SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
                                      SplatUndef.getZExtValue(), SplatBitSize,
                                      DAG, dl, VmovVT, VT.is128BitVector(),
                                      VMOVModImm);

      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Try an immediate VMVN.
      uint64_t NegatedImm = (~SplatBits).getZExtValue();
      Val = isVMOVModifiedImm(
          NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
          DAG, dl, VmovVT, VT.is128BitVector(),
          ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
      if (Val.getNode()) {
        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
      }

      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
        // -1 is treated as "not encodable" here.
        if (ImmVal != -1) {
          SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
        }
      }
    }
  }

  // Scan through the operands to see if only one value is used.
  //
  // As an optimisation, even if more than one value is used it may be more
  // profitable to splat with one value then change some lanes.
  //
  // Heuristically we decide to do this if the vector has a "dominant" value,
  // defined as splatted to more than half of the lanes.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool hasDominantValue = false;
  bool isConstant = true;

  // Map of the number of times a particular SDValue appears in the
  // element list.
  DenseMap<SDValue, unsigned> ValueCounts;
  SDValue Value;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    // Undef lanes are ignored by all of the statistics gathered here.
    if (V.isUndef())
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    // insert() leaves an existing entry untouched, so this only seeds a zero
    // count the first time V is seen.
    ValueCounts.insert(std::make_pair(V, 0));
    unsigned &Count = ValueCounts[V];

    // Is this value dominant? (takes up more than half of the lanes)
    if (++Count > (NumElts / 2)) {
      hasDominantValue = true;
      Value = V;
    }
  }
  if (ValueCounts.size() != 1)
    usesOnlyOneValue = false;
  // No dominant value: fall back to whichever entry the map yields first.
  if (!Value.getNode() && !ValueCounts.empty())
    Value = ValueCounts.begin()->first;

  // Every operand was undef.
  if (ValueCounts.empty())
    return DAG.getUNDEF(VT);

  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
  // Keep going if we are hitting this case.
  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  unsigned EltSize = VT.getScalarSizeInBits();

  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
  // i32 and try again.
  if (hasDominantValue && EltSize <= 32) {
    if (!isConstant) {
      SDValue N;

      // If we are VDUPing a value that comes directly from a vector, that will
      // cause an unnecessary move to and from a GPR, where instead we could
      // just use VDUPLANE. We can only do this if the lane being extracted
      // is at a constant index, as the VDUP from lane instructions only have
      // constant-index forms.
      ConstantSDNode *constIndex;
      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
        // We need to create a new undef vector to use for the VDUPLANE if the
        // size of the vector from which we get the value is different than the
        // size of the vector that we need to create. We will insert the element
        // such that the register coalescer will remove unnecessary copies.
        if (VT != Value->getOperand(0).getValueType()) {
          // Wrap the source lane number into the range of the result vector.
          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                             VT.getVectorNumElements();
          N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                        Value, DAG.getConstant(index, dl, MVT::i32)),
                           DAG.getConstant(index, dl, MVT::i32));
        } else
          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                        Value->getOperand(0), Value->getOperand(1));
      } else
        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);

      if (!usesOnlyOneValue) {
        // The dominant value was splatted as 'N', but we now have to insert
        // all differing elements.
        for (unsigned I = 0; I < NumElts; ++I) {
          if (Op.getOperand(I) == Value)
            continue;
          SmallVector<SDValue, 3> Ops;
          Ops.push_back(N);
          Ops.push_back(Op.getOperand(I));
          Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
        }
      }
      return N;
    }
    // Constant floating-point vector: rebuild it with same-width integer
    // elements and retry, bitcasting the result back on success.
    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      MVT FVT = VT.getVectorElementType().getSimpleVT();
      assert(FVT == MVT::f32 || FVT == MVT::f16);
      MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
                                  Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
      SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
      Val = LowerBUILD_VECTOR(Val, DAG, ST);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
    if (usesOnlyOneValue) {
      // NOTE(review): IsSingleInstrConstant is defined elsewhere; judging by
      // its name it yields a node only for cheaply materialisable constants.
      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
      if (isConstant && Val.getNode())
        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
    }
  }

  // If all elements are constants and the case above didn't get hit, fall back
  // to the default expansion, which will generate a load from the constant
  // pool.
  if (isConstant)
    return SDValue();

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    SDValue shuffle = ReconstructShuffle(Op, DAG);
    if (shuffle != SDValue())
      return shuffle;
  }

  if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
    // If we haven't found an efficient lowering, try splitting a 128-bit vector
    // into two 64-bit vectors; we might discover a better way to lower it.
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
    EVT ExtVT = VT.getVectorElementType();
    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
    // Only recurse while the half is still a BUILD_VECTOR node.
    if (Lower.getOpcode() == ISD::BUILD_VECTOR)
      Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
    if (Upper.getOpcode() == ISD::BUILD_VECTOR)
      Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
    // Either half may have come back null; only concatenate if both lowered.
    if (Lower && Upper)
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
  }

  // Vectors with 32- or 64-bit elements can be built by directly assigning
  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
  // will be legalized.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    SmallVector<SDValue, 8> Ops;
    for (unsigned i = 0; i < NumElts; ++i)
      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  if (!isConstant && !usesOnlyOneValue) {
    SDValue Vec = DAG.getUNDEF(VT);
    for (unsigned i = 0 ; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.isUndef())
        continue;
      SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  return SDValue();
}
7209
/// Try to rewrite a BUILD_VECTOR whose operands are all undef or
/// constant-index EXTRACT_VECTOR_ELTs of at most two source vectors as a
/// single vector shuffle of those sources.
///
/// Sources half as wide as the result are padded with undef; sources twice as
/// wide are narrowed with EXTRACT_SUBVECTOR or VEXT. Sources whose element
/// type differs from the smallest element type involved are bitcast to it, and
/// the shuffle is built over that smallest element type.
///
/// \param Op  the BUILD_VECTOR node (asserted).
/// \param DAG the DAG in which new nodes are created.
/// \returns the shuffle bitcast to Op's type, or a null SDValue when the
///          operands do not fit this pattern or no legal shuffle is produced.
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                              SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  // Bookkeeping for one vector that feeds the BUILD_VECTOR.
  struct ShuffleSourceInfo {
    // The vector as it appears in the EXTRACT_VECTOR_ELT operands.
    SDValue Vec;
    // Smallest and largest lane numbers extracted from Vec.
    unsigned MinElt = std::numeric_limits<unsigned>::max();
    unsigned MaxElt = 0;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
    // be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element "WindowBase
    // + i * WindowScale in ShuffleVec".
    int WindowBase = 0;
    int WindowScale = 1;

    ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}

    // Lets llvm::find() look a source up by its original vector.
    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.isUndef())
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
      // Furthermore, shuffles require a constant mask, whereas extractelts
      // accept variable indices.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = llvm::find(Sources, SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find out the smallest element size among result and two sources, and use
  // it as element size to build the shuffle_vector.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  // Number of shuffle lanes covered by one element of the result.
  unsigned ResMultiplier =
      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
  // From here on NumElts counts lanes of the shuffle type, not of VT.
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be able
  // to construct a compatible shuffle either by concatenating it with UNDEF or
  // extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      // Only a source exactly half as wide as the result is handled.
      if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
        return SDValue();
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    // Only a source exactly twice as wide as the result is handled.
    if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
      return SDValue();

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));
      // Lane numbers now count from the start of the second half.
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
    } else {
      // An actual VEXT is needed
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, dl, MVT::i32));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, dl, MVT::i32));

      Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2,
                                   DAG.getConstant(Src.MinElt, dl, MVT::i32));
      // The window now starts at the smallest lane that was extracted.
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    // Each original lane now spans WindowScale shuffle lanes, so the base
    // offset is rescaled to the same unit.
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  LLVM_DEBUG(for (auto Src
                  : Sources)
                 assert(Src.ShuffleVec.getValueType() == ShuffleVT););

  // The stars all align, our next step is to produce the mask for the shuffle.
  // Lanes left at -1 are undefined in the resulting shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.isUndef())
      continue;

    auto Src = llvm::find(Sources, Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
    // segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                               VT.getScalarSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final shuffle,
    // starting at the appropriate offset.
    int *LaneMask = &Mask[i * ResMultiplier];

    // Translate the original lane number into the (possibly windowed and
    // rescaled) ShuffleVec, then offset by NumElts for the second source.
    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }


  // We can't handle more than two sources. This should have already
  // been checked before this point.
  assert(Sources.size() <= 2 && "Too many sources!");

  // Shuffle inputs without a corresponding source stay undef.
  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                            ShuffleOps[1], Mask, DAG);
  if (!Shuffle)
    return SDValue();
  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
7409
// Operation selected by a PerfectShuffleTable entry. GeneratePerfectShuffle and
// isLegalMVEShuffleOp decode it from bits 26-29 of the entry.
// GeneratePerfectShuffle derives lane numbers, VEXT immediates and result
// indices by subtracting the first enumerator of each group, so the order
// within each group must be kept.
enum ShuffleOpCodes {
  OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
  OP_VREV,  // VREV64/VREV32/VREV16, picked from the element type
  OP_VDUP0, // VDUPLANE of lane 0
  OP_VDUP1, // VDUPLANE of lane 1
  OP_VDUP2, // VDUPLANE of lane 2
  OP_VDUP3, // VDUPLANE of lane 3
  OP_VEXT1, // VEXT with immediate 1
  OP_VEXT2, // VEXT with immediate 2
  OP_VEXT3, // VEXT with immediate 3
  OP_VUZPL, // VUZP, left result
  OP_VUZPR, // VUZP, right result
  OP_VZIPL, // VZIP, left result
  OP_VZIPR, // VZIP, right result
  OP_VTRNL, // VTRN, left result
  OP_VTRNR  // VTRN, right result
};
7427
7428 static bool isLegalMVEShuffleOp(unsigned PFEntry) {
7429   unsigned OpNum = (PFEntry >> 26) & 0x0F;
7430   switch (OpNum) {
7431   case OP_COPY:
7432   case OP_VREV:
7433   case OP_VDUP0:
7434   case OP_VDUP1:
7435   case OP_VDUP2:
7436   case OP_VDUP3:
7437     return true;
7438   }
7439   return false;
7440 }
7441
7442 /// isShuffleMaskLegal - Targets can use this to indicate that they only
7443 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7444 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7445 /// are assumed to be legal.
7446 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
7447   if (VT.getVectorNumElements() == 4 &&
7448       (VT.is128BitVector() || VT.is64BitVector())) {
7449     unsigned PFIndexes[4];
7450     for (unsigned i = 0; i != 4; ++i) {
7451       if (M[i] < 0)
7452         PFIndexes[i] = 8;
7453       else
7454         PFIndexes[i] = M[i];
7455     }
7456
7457     // Compute the index in the perfect shuffle table.
7458     unsigned PFTableIndex =
7459       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
7460     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7461     unsigned Cost = (PFEntry >> 30);
7462
7463     if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
7464       return true;
7465   }
7466
7467   bool ReverseVEXT, isV_UNDEF;
7468   unsigned Imm, WhichResult;
7469
7470   unsigned EltSize = VT.getScalarSizeInBits();
7471   if (EltSize >= 32 ||
7472       ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7473       ShuffleVectorInst::isIdentityMask(M) ||
7474       isVREVMask(M, VT, 64) ||
7475       isVREVMask(M, VT, 32) ||
7476       isVREVMask(M, VT, 16))
7477     return true;
7478   else if (Subtarget->hasNEON() &&
7479            (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
7480             isVTBLMask(M, VT) ||
7481             isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
7482     return true;
7483   else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
7484            isReverseMask(M, VT))
7485     return true;
7486   else
7487     return false;
7488 }
7489
7490 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
7491 /// the specified operations to build the shuffle.
7492 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
7493                                       SDValue RHS, SelectionDAG &DAG,
7494                                       const SDLoc &dl) {
7495   unsigned OpNum = (PFEntry >> 26) & 0x0F;
7496   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
7497   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
7498
7499   if (OpNum == OP_COPY) {
7500     if (LHSID == (1*9+2)*9+3) return LHS;
7501     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
7502     return RHS;
7503   }
7504
7505   SDValue OpLHS, OpRHS;
7506   OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
7507   OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
7508   EVT VT = OpLHS.getValueType();
7509
7510   switch (OpNum) {
7511   default: llvm_unreachable("Unknown shuffle opcode!");
7512   case OP_VREV:
7513     // VREV divides the vector in half and swaps within the half.
7514     if (VT.getVectorElementType() == MVT::i32 ||
7515         VT.getVectorElementType() == MVT::f32)
7516       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
7517     // vrev <4 x i16> -> VREV32
7518     if (VT.getVectorElementType() == MVT::i16)
7519       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
7520     // vrev <4 x i8> -> VREV16
7521     assert(VT.getVectorElementType() == MVT::i8);
7522     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
7523   case OP_VDUP0:
7524   case OP_VDUP1:
7525   case OP_VDUP2:
7526   case OP_VDUP3:
7527     return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7528                        OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
7529   case OP_VEXT1:
7530   case OP_VEXT2:
7531   case OP_VEXT3:
7532     return DAG.getNode(ARMISD::VEXT, dl, VT,
7533                        OpLHS, OpRHS,
7534                        DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
7535   case OP_VUZPL:
7536   case OP_VUZPR:
7537     return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
7538                        OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
7539   case OP_VZIPL:
7540   case OP_VZIPR:
7541     return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
7542                        OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
7543   case OP_VTRNL:
7544   case OP_VTRNR:
7545     return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
7546                        OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
7547   }
7548 }
7549
7550 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
7551                                        ArrayRef<int> ShuffleMask,
7552                                        SelectionDAG &DAG) {
7553   // Check to see if we can use the VTBL instruction.
7554   SDValue V1 = Op.getOperand(0);
7555   SDValue V2 = Op.getOperand(1);
7556   SDLoc DL(Op);
7557
7558   SmallVector<SDValue, 8> VTBLMask;
7559   for (ArrayRef<int>::iterator
7560          I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
7561     VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32));
7562
7563   if (V2.getNode()->isUndef())
7564     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
7565                        DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
7566
7567   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
7568                      DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
7569 }
7570
7571 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
7572                                                       SelectionDAG &DAG) {
7573   SDLoc DL(Op);
7574   SDValue OpLHS = Op.getOperand(0);
7575   EVT VT = OpLHS.getValueType();
7576
7577   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
7578          "Expect an v8i16/v16i8 type");
7579   OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
7580   // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
7581   // extract the first 8 bytes into the top double word and the last 8 bytes
7582   // into the bottom double word. The v8i16 case is similar.
7583   unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
7584   return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
7585                      DAG.getConstant(ExtractNum, DL, MVT::i32));
7586 }
7587
7588 static EVT getVectorTyFromPredicateVector(EVT VT) {
7589   switch (VT.getSimpleVT().SimpleTy) {
7590   case MVT::v4i1:
7591     return MVT::v4i32;
7592   case MVT::v8i1:
7593     return MVT::v8i16;
7594   case MVT::v16i1:
7595     return MVT::v16i8;
7596   default:
7597     llvm_unreachable("Unexpected vector predicate type");
7598   }
7599 }
7600
7601 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
7602                                     SelectionDAG &DAG) {
7603   // Converting from boolean predicates to integers involves creating a vector
7604   // of all ones or all zeroes and selecting the lanes based upon the real
7605   // predicate.
7606   SDValue AllOnes =
7607       DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
7608   AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
7609
7610   SDValue AllZeroes =
7611       DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
7612   AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
7613
7614   // Get full vector type from predicate type
7615   EVT NewVT = getVectorTyFromPredicateVector(VT);
7616
7617   SDValue RecastV1;
7618   // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
7619   // this to a v16i1. This cannot be done with an ordinary bitcast because the
7620   // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
7621   // since we know in hardware the sizes are really the same.
7622   if (VT != MVT::v16i1)
7623     RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
7624   else
7625     RecastV1 = Pred;
7626
7627   // Select either all ones or zeroes depending upon the real predicate bits.
7628   SDValue PredAsVector =
7629       DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
7630
7631   // Recast our new predicate-as-integer v16i8 vector into something
7632   // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
7633   return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
7634 }
7635
7636 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
7637                                       const ARMSubtarget *ST) {
7638   EVT VT = Op.getValueType();
7639   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
7640   ArrayRef<int> ShuffleMask = SVN->getMask();
7641
7642   assert(ST->hasMVEIntegerOps() &&
7643          "No support for vector shuffle of boolean predicates");
7644
7645   SDValue V1 = Op.getOperand(0);
7646   SDLoc dl(Op);
7647   if (isReverseMask(ShuffleMask, VT)) {
7648     SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
7649     SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
7650     SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
7651                               DAG.getConstant(16, dl, MVT::i32));
7652     return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
7653   }
7654
7655   // Until we can come up with optimised cases for every single vector
7656   // shuffle in existence we have chosen the least painful strategy. This is
7657   // to essentially promote the boolean predicate to a 8-bit integer, where
7658   // each predicate represents a byte. Then we fall back on a normal integer
7659   // vector shuffle and convert the result back into a predicate vector. In
7660   // many cases the generated code might be even better than scalar code
7661   // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
7662   // fields in a register into 8 other arbitrary 2-bit fields!
7663   SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
7664   EVT NewVT = PredAsVector.getValueType();
7665
7666   // Do the shuffle!
7667   SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
7668                                           DAG.getUNDEF(NewVT), ShuffleMask);
7669
7670   // Now return the result of comparing the shuffled vector with zero,
7671   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
7672   return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
7673                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
7674 }
7675
7676 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
7677                                    const ARMSubtarget *ST) {
7678   SDValue V1 = Op.getOperand(0);
7679   SDValue V2 = Op.getOperand(1);
7680   SDLoc dl(Op);
7681   EVT VT = Op.getValueType();
7682   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
7683   unsigned EltSize = VT.getScalarSizeInBits();
7684
7685   if (ST->hasMVEIntegerOps() && EltSize == 1)
7686     return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
7687
7688   // Convert shuffles that are directly supported on NEON to target-specific
7689   // DAG nodes, instead of keeping them as shuffles and matching them again
7690   // during code selection.  This is more efficient and avoids the possibility
7691   // of inconsistencies between legalization and selection.
7692   // FIXME: floating-point vectors should be canonicalized to integer vectors
7693   // of the same time so that they get CSEd properly.
7694   ArrayRef<int> ShuffleMask = SVN->getMask();
7695
7696   if (EltSize <= 32) {
7697     if (SVN->isSplat()) {
7698       int Lane = SVN->getSplatIndex();
7699       // If this is undef splat, generate it via "just" vdup, if possible.
7700       if (Lane == -1) Lane = 0;
7701
7702       // Test if V1 is a SCALAR_TO_VECTOR.
7703       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7704         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
7705       }
7706       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
7707       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
7708       // reaches it).
7709       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
7710           !isa<ConstantSDNode>(V1.getOperand(0))) {
7711         bool IsScalarToVector = true;
7712         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
7713           if (!V1.getOperand(i).isUndef()) {
7714             IsScalarToVector = false;
7715             break;
7716           }
7717         if (IsScalarToVector)
7718           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
7719       }
7720       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
7721                          DAG.getConstant(Lane, dl, MVT::i32));
7722     }
7723
7724     bool ReverseVEXT = false;
7725     unsigned Imm = 0;
7726     if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
7727       if (ReverseVEXT)
7728         std::swap(V1, V2);
7729       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
7730                          DAG.getConstant(Imm, dl, MVT::i32));
7731     }
7732
7733     if (isVREVMask(ShuffleMask, VT, 64))
7734       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
7735     if (isVREVMask(ShuffleMask, VT, 32))
7736       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
7737     if (isVREVMask(ShuffleMask, VT, 16))
7738       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
7739
7740     if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
7741       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
7742                          DAG.getConstant(Imm, dl, MVT::i32));
7743     }
7744
7745     // Check for Neon shuffles that modify both input vectors in place.
7746     // If both results are used, i.e., if there are two shuffles with the same
7747     // source operands and with masks corresponding to both results of one of
7748     // these operations, DAG memoization will ensure that a single node is
7749     // used for both shuffles.
7750     unsigned WhichResult = 0;
7751     bool isV_UNDEF = false;
7752     if (ST->hasNEON()) {
7753       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
7754               ShuffleMask, VT, WhichResult, isV_UNDEF)) {
7755         if (isV_UNDEF)
7756           V2 = V1;
7757         return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
7758             .getValue(WhichResult);
7759       }
7760     }
7761
7762     // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
7763     // shuffles that produce a result larger than their operands with:
7764     //   shuffle(concat(v1, undef), concat(v2, undef))
7765     // ->
7766     //   shuffle(concat(v1, v2), undef)
7767     // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
7768     //
7769     // This is useful in the general case, but there are special cases where
7770     // native shuffles produce larger results: the two-result ops.
7771     //
7772     // Look through the concat when lowering them:
7773     //   shuffle(concat(v1, v2), undef)
7774     // ->
7775     //   concat(VZIP(v1, v2):0, :1)
7776     //
7777     if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
7778       SDValue SubV1 = V1->getOperand(0);
7779       SDValue SubV2 = V1->getOperand(1);
7780       EVT SubVT = SubV1.getValueType();
7781
7782       // We expect these to have been canonicalized to -1.
7783       assert(llvm::all_of(ShuffleMask, [&](int i) {
7784         return i < (int)VT.getVectorNumElements();
7785       }) && "Unexpected shuffle index into UNDEF operand!");
7786
7787       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
7788               ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
7789         if (isV_UNDEF)
7790           SubV2 = SubV1;
7791         assert((WhichResult == 0) &&
7792                "In-place shuffle of concat can only have one result!");
7793         SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
7794                                   SubV1, SubV2);
7795         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
7796                            Res.getValue(1));
7797       }
7798     }
7799   }
7800
7801   // If the shuffle is not directly supported and it has 4 elements, use
7802   // the PerfectShuffle-generated table to synthesize it from other shuffles.
7803   unsigned NumElts = VT.getVectorNumElements();
7804   if (NumElts == 4) {
7805     unsigned PFIndexes[4];
7806     for (unsigned i = 0; i != 4; ++i) {
7807       if (ShuffleMask[i] < 0)
7808         PFIndexes[i] = 8;
7809       else
7810         PFIndexes[i] = ShuffleMask[i];
7811     }
7812
7813     // Compute the index in the perfect shuffle table.
7814     unsigned PFTableIndex =
7815       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
7816     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
7817     unsigned Cost = (PFEntry >> 30);
7818
7819     if (Cost <= 4) {
7820       if (ST->hasNEON())
7821         return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
7822       else if (isLegalMVEShuffleOp(PFEntry)) {
7823         unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
7824         unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
7825         unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
7826         unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
7827         if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
7828           return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
7829       }
7830     }
7831   }
7832
7833   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
7834   if (EltSize >= 32) {
7835     // Do the expansion with floating-point types, since that is what the VFP
7836     // registers are defined to use, and since i64 is not legal.
7837     EVT EltVT = EVT::getFloatingPointVT(EltSize);
7838     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7839     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
7840     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
7841     SmallVector<SDValue, 8> Ops;
7842     for (unsigned i = 0; i < NumElts; ++i) {
7843       if (ShuffleMask[i] < 0)
7844         Ops.push_back(DAG.getUNDEF(EltVT));
7845       else
7846         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
7847                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
7848                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
7849                                                   dl, MVT::i32)));
7850     }
7851     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7852     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7853   }
7854
7855   if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
7856     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
7857
7858   if (ST->hasNEON() && VT == MVT::v8i8)
7859     if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
7860       return NewOp;
7861
7862   return SDValue();
7863 }
7864
7865 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
7866                                          const ARMSubtarget *ST) {
7867   EVT VecVT = Op.getOperand(0).getValueType();
7868   SDLoc dl(Op);
7869
7870   assert(ST->hasMVEIntegerOps() &&
7871          "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
7872
7873   SDValue Conv =
7874       DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
7875   unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
7876   unsigned LaneWidth =
7877       getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
7878   unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
7879   SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
7880                             Op.getOperand(1), DAG.getValueType(MVT::i1));
7881   SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
7882                             DAG.getConstant(~Mask, dl, MVT::i32));
7883   return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
7884 }
7885
7886 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
7887                                                   SelectionDAG &DAG) const {
7888   // INSERT_VECTOR_ELT is legal only for immediate indexes.
7889   SDValue Lane = Op.getOperand(2);
7890   if (!isa<ConstantSDNode>(Lane))
7891     return SDValue();
7892
7893   SDValue Elt = Op.getOperand(1);
7894   EVT EltVT = Elt.getValueType();
7895
7896   if (Subtarget->hasMVEIntegerOps() &&
7897       Op.getValueType().getScalarSizeInBits() == 1)
7898     return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
7899
7900   if (getTypeAction(*DAG.getContext(), EltVT) ==
7901       TargetLowering::TypePromoteFloat) {
7902     // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
7903     // but the type system will try to do that if we don't intervene.
7904     // Reinterpret any such vector-element insertion as one with the
7905     // corresponding integer types.
7906
7907     SDLoc dl(Op);
7908
7909     EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
7910     assert(getTypeAction(*DAG.getContext(), IEltVT) !=
7911            TargetLowering::TypePromoteFloat);
7912
7913     SDValue VecIn = Op.getOperand(0);
7914     EVT VecVT = VecIn.getValueType();
7915     EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
7916                                   VecVT.getVectorNumElements());
7917
7918     SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
7919     SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
7920     SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
7921                                   IVecIn, IElt, Lane);
7922     return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
7923   }
7924
7925   return Op;
7926 }
7927
7928 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
7929                                           const ARMSubtarget *ST) {
7930   EVT VecVT = Op.getOperand(0).getValueType();
7931   SDLoc dl(Op);
7932
7933   assert(ST->hasMVEIntegerOps() &&
7934          "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
7935
7936   SDValue Conv =
7937       DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
7938   unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7939   unsigned LaneWidth =
7940       getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
7941   SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
7942                               DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
7943   return Shift;
7944 }
7945
7946 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
7947                                        const ARMSubtarget *ST) {
7948   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
7949   SDValue Lane = Op.getOperand(1);
7950   if (!isa<ConstantSDNode>(Lane))
7951     return SDValue();
7952
7953   SDValue Vec = Op.getOperand(0);
7954   EVT VT = Vec.getValueType();
7955
7956   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7957     return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
7958
7959   if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
7960     SDLoc dl(Op);
7961     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
7962   }
7963
7964   return Op;
7965 }
7966
7967 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
7968                                       const ARMSubtarget *ST) {
7969   SDValue V1 = Op.getOperand(0);
7970   SDValue V2 = Op.getOperand(1);
7971   SDLoc dl(Op);
7972   EVT VT = Op.getValueType();
7973   EVT Op1VT = V1.getValueType();
7974   EVT Op2VT = V2.getValueType();
7975   unsigned NumElts = VT.getVectorNumElements();
7976
7977   assert(Op1VT == Op2VT && "Operand types don't match!");
7978   assert(VT.getScalarSizeInBits() == 1 &&
7979          "Unexpected custom CONCAT_VECTORS lowering");
7980   assert(ST->hasMVEIntegerOps() &&
7981          "CONCAT_VECTORS lowering only supported for MVE");
7982
7983   SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
7984   SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
7985
7986   // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
7987   // promoted to v8i16, etc.
7988
7989   MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
7990
7991   // Extract the vector elements from Op1 and Op2 one by one and truncate them
7992   // to be the right size for the destination. For example, if Op1 is v4i1 then
7993   // the promoted vector is v4i32. The result of concatentation gives a v8i1,
7994   // which when promoted is v8i16. That means each i32 element from Op1 needs
7995   // truncating to i16 and inserting in the result.
7996   EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
7997   SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
7998   auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
7999     EVT NewVT = NewV.getValueType();
8000     EVT ConcatVT = ConVec.getValueType();
8001     for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8002       SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8003                                 DAG.getIntPtrConstant(i, dl));
8004       ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8005                            DAG.getConstant(j, dl, MVT::i32));
8006     }
8007     return ConVec;
8008   };
8009   unsigned j = 0;
8010   ConVec = ExractInto(NewV1, ConVec, j);
8011   ConVec = ExractInto(NewV2, ConVec, j);
8012
8013   // Now return the result of comparing the subvector with zero,
8014   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8015   return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8016                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8017 }
8018
8019 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8020                                    const ARMSubtarget *ST) {
8021   EVT VT = Op->getValueType(0);
8022   if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8023     return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8024
8025   // The only time a CONCAT_VECTORS operation can have legal types is when
8026   // two 64-bit vectors are concatenated to a 128-bit vector.
8027   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8028          "unexpected CONCAT_VECTORS");
8029   SDLoc dl(Op);
8030   SDValue Val = DAG.getUNDEF(MVT::v2f64);
8031   SDValue Op0 = Op.getOperand(0);
8032   SDValue Op1 = Op.getOperand(1);
8033   if (!Op0.isUndef())
8034     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8035                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8036                       DAG.getIntPtrConstant(0, dl));
8037   if (!Op1.isUndef())
8038     Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8039                       DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8040                       DAG.getIntPtrConstant(1, dl));
8041   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8042 }
8043
8044 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
8045                                       const ARMSubtarget *ST) {
8046   SDValue V1 = Op.getOperand(0);
8047   SDValue V2 = Op.getOperand(1);
8048   SDLoc dl(Op);
8049   EVT VT = Op.getValueType();
8050   EVT Op1VT = V1.getValueType();
8051   unsigned NumElts = VT.getVectorNumElements();
8052   unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue();
8053
8054   assert(VT.getScalarSizeInBits() == 1 &&
8055          "Unexpected custom EXTRACT_SUBVECTOR lowering");
8056   assert(ST->hasMVEIntegerOps() &&
8057          "EXTRACT_SUBVECTOR lowering only supported for MVE");
8058
8059   SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8060
8061   // We now have Op1 promoted to a vector of integers, where v8i1 gets
8062   // promoted to v8i16, etc.
8063
8064   MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8065
8066   EVT SubVT = MVT::getVectorVT(ElType, NumElts);
8067   SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8068   for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
8069     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8070                               DAG.getIntPtrConstant(i, dl));
8071     SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8072                          DAG.getConstant(j, dl, MVT::i32));
8073   }
8074
8075   // Now return the result of comparing the subvector with zero,
8076   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8077   return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
8078                      DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8079 }
8080
8081 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
8082 /// element has been zero/sign-extended, depending on the isSigned parameter,
8083 /// from an integer type half its size.
8084 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
8085                                    bool isSigned) {
8086   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
8087   EVT VT = N->getValueType(0);
8088   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
8089     SDNode *BVN = N->getOperand(0).getNode();
8090     if (BVN->getValueType(0) != MVT::v4i32 ||
8091         BVN->getOpcode() != ISD::BUILD_VECTOR)
8092       return false;
8093     unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
8094     unsigned HiElt = 1 - LoElt;
8095     ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
8096     ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
8097     ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
8098     ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
8099     if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
8100       return false;
8101     if (isSigned) {
8102       if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
8103           Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
8104         return true;
8105     } else {
8106       if (Hi0->isNullValue() && Hi1->isNullValue())
8107         return true;
8108     }
8109     return false;
8110   }
8111
8112   if (N->getOpcode() != ISD::BUILD_VECTOR)
8113     return false;
8114
8115   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
8116     SDNode *Elt = N->getOperand(i).getNode();
8117     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
8118       unsigned EltSize = VT.getScalarSizeInBits();
8119       unsigned HalfSize = EltSize / 2;
8120       if (isSigned) {
8121         if (!isIntN(HalfSize, C->getSExtValue()))
8122           return false;
8123       } else {
8124         if (!isUIntN(HalfSize, C->getZExtValue()))
8125           return false;
8126       }
8127       continue;
8128     }
8129     return false;
8130   }
8131
8132   return true;
8133 }
8134
8135 /// isSignExtended - Check if a node is a vector value that is sign-extended
8136 /// or a constant BUILD_VECTOR with sign-extended elements.
8137 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
8138   if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
8139     return true;
8140   if (isExtendedBUILD_VECTOR(N, DAG, true))
8141     return true;
8142   return false;
8143 }
8144
8145 /// isZeroExtended - Check if a node is a vector value that is zero-extended
8146 /// or a constant BUILD_VECTOR with zero-extended elements.
8147 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
8148   if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
8149     return true;
8150   if (isExtendedBUILD_VECTOR(N, DAG, false))
8151     return true;
8152   return false;
8153 }
8154
8155 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
8156   if (OrigVT.getSizeInBits() >= 64)
8157     return OrigVT;
8158
8159   assert(OrigVT.isSimple() && "Expecting a simple value type");
8160
8161   MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
8162   switch (OrigSimpleTy) {
8163   default: llvm_unreachable("Unexpected Vector Type");
8164   case MVT::v2i8:
8165   case MVT::v2i16:
8166      return MVT::v2i32;
8167   case MVT::v4i8:
8168     return  MVT::v4i16;
8169   }
8170 }
8171
8172 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
8173 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
8174 /// We insert the required extension here to get the vector to fill a D register.
8175 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
8176                                             const EVT &OrigTy,
8177                                             const EVT &ExtTy,
8178                                             unsigned ExtOpcode) {
8179   // The vector originally had a size of OrigTy. It was then extended to ExtTy.
8180   // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
8181   // 64-bits we need to insert a new extension so that it will be 64-bits.
8182   assert(ExtTy.is128BitVector() && "Unexpected extension size");
8183   if (OrigTy.getSizeInBits() >= 64)
8184     return N;
8185
8186   // Must extend size to at least 64 bits to be used as an operand for VMULL.
8187   EVT NewVT = getExtensionTo64Bits(OrigTy);
8188
8189   return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
8190 }
8191
8192 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
8193 /// does not do any sign/zero extension. If the original vector is less
8194 /// than 64 bits, an appropriate extension will be added after the load to
8195 /// reach a total size of 64 bits. We have to add the extension separately
8196 /// because ARM does not have a sign/zero extending load for vectors.
8197 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
8198   EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
8199
8200   // The load already has the right type.
8201   if (ExtendedTy == LD->getMemoryVT())
8202     return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
8203                        LD->getBasePtr(), LD->getPointerInfo(),
8204                        LD->getAlignment(), LD->getMemOperand()->getFlags());
8205
8206   // We need to create a zextload/sextload. We cannot just create a load
8207   // followed by a zext/zext node because LowerMUL is also run during normal
8208   // operation legalization where we can't create illegal types.
8209   return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
8210                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
8211                         LD->getMemoryVT(), LD->getAlignment(),
8212                         LD->getMemOperand()->getFlags());
8213 }
8214
8215 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
8216 /// extending load, or BUILD_VECTOR with extended elements, return the
8217 /// unextended value. The unextended vector should be 64 bits so that it can
8218 /// be used as an operand to a VMULL instruction. If the original vector size
8219 /// before extension is less than 64 bits we add a an extension to resize
8220 /// the vector to 64 bits.
8221 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
8222   if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
8223     return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
8224                                         N->getOperand(0)->getValueType(0),
8225                                         N->getValueType(0),
8226                                         N->getOpcode());
8227
8228   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
8229     assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
8230            "Expected extending load");
8231
8232     SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
8233     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
8234     unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8235     SDValue extLoad =
8236         DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
8237     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
8238
8239     return newLoad;
8240   }
8241
8242   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
8243   // have been legalized as a BITCAST from v4i32.
8244   if (N->getOpcode() == ISD::BITCAST) {
8245     SDNode *BVN = N->getOperand(0).getNode();
8246     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
8247            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
8248     unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
8249     return DAG.getBuildVector(
8250         MVT::v2i32, SDLoc(N),
8251         {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
8252   }
8253   // Construct a new BUILD_VECTOR with elements truncated to half the size.
8254   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
8255   EVT VT = N->getValueType(0);
8256   unsigned EltSize = VT.getScalarSizeInBits() / 2;
8257   unsigned NumElts = VT.getVectorNumElements();
8258   MVT TruncVT = MVT::getIntegerVT(EltSize);
8259   SmallVector<SDValue, 8> Ops;
8260   SDLoc dl(N);
8261   for (unsigned i = 0; i != NumElts; ++i) {
8262     ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
8263     const APInt &CInt = C->getAPIntValue();
8264     // Element types smaller than 32 bits are not legal, so use i32 elements.
8265     // The values are implicitly truncated so sext vs. zext doesn't matter.
8266     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
8267   }
8268   return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
8269 }
8270
8271 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
8272   unsigned Opcode = N->getOpcode();
8273   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
8274     SDNode *N0 = N->getOperand(0).getNode();
8275     SDNode *N1 = N->getOperand(1).getNode();
8276     return N0->hasOneUse() && N1->hasOneUse() &&
8277       isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
8278   }
8279   return false;
8280 }
8281
8282 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
8283   unsigned Opcode = N->getOpcode();
8284   if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
8285     SDNode *N0 = N->getOperand(0).getNode();
8286     SDNode *N1 = N->getOperand(1).getNode();
8287     return N0->hasOneUse() && N1->hasOneUse() &&
8288       isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
8289   }
8290   return false;
8291 }
8292
8293 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
8294   // Multiplications are only custom-lowered for 128-bit vectors so that
8295   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
8296   EVT VT = Op.getValueType();
8297   assert(VT.is128BitVector() && VT.isInteger() &&
8298          "unexpected type for custom-lowering ISD::MUL");
8299   SDNode *N0 = Op.getOperand(0).getNode();
8300   SDNode *N1 = Op.getOperand(1).getNode();
8301   unsigned NewOpc = 0;
8302   bool isMLA = false;
8303   bool isN0SExt = isSignExtended(N0, DAG);
8304   bool isN1SExt = isSignExtended(N1, DAG);
8305   if (isN0SExt && isN1SExt)
8306     NewOpc = ARMISD::VMULLs;
8307   else {
8308     bool isN0ZExt = isZeroExtended(N0, DAG);
8309     bool isN1ZExt = isZeroExtended(N1, DAG);
8310     if (isN0ZExt && isN1ZExt)
8311       NewOpc = ARMISD::VMULLu;
8312     else if (isN1SExt || isN1ZExt) {
8313       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
8314       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
8315       if (isN1SExt && isAddSubSExt(N0, DAG)) {
8316         NewOpc = ARMISD::VMULLs;
8317         isMLA = true;
8318       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
8319         NewOpc = ARMISD::VMULLu;
8320         isMLA = true;
8321       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
8322         std::swap(N0, N1);
8323         NewOpc = ARMISD::VMULLu;
8324         isMLA = true;
8325       }
8326     }
8327
8328     if (!NewOpc) {
8329       if (VT == MVT::v2i64)
8330         // Fall through to expand this.  It is not legal.
8331         return SDValue();
8332       else
8333         // Other vector multiplications are legal.
8334         return Op;
8335     }
8336   }
8337
8338   // Legalize to a VMULL instruction.
8339   SDLoc DL(Op);
8340   SDValue Op0;
8341   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
8342   if (!isMLA) {
8343     Op0 = SkipExtensionForVMULL(N0, DAG);
8344     assert(Op0.getValueType().is64BitVector() &&
8345            Op1.getValueType().is64BitVector() &&
8346            "unexpected types for extended operands to VMULL");
8347     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
8348   }
8349
8350   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
8351   // isel lowering to take advantage of no-stall back to back vmul + vmla.
8352   //   vmull q0, d4, d6
8353   //   vmlal q0, d5, d6
8354   // is faster than
8355   //   vaddl q0, d4, d5
8356   //   vmovl q1, d6
8357   //   vmul  q0, q0, q1
8358   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
8359   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
8360   EVT Op1VT = Op1.getValueType();
8361   return DAG.getNode(N0->getOpcode(), DL, VT,
8362                      DAG.getNode(NewOpc, DL, VT,
8363                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
8364                      DAG.getNode(NewOpc, DL, VT,
8365                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
8366 }
8367
8368 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
8369                               SelectionDAG &DAG) {
8370   // TODO: Should this propagate fast-math-flags?
8371
8372   // Convert to float
8373   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
8374   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
8375   X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
8376   Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
8377   X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
8378   Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
8379   // Get reciprocal estimate.
8380   // float4 recip = vrecpeq_f32(yf);
8381   Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8382                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
8383                    Y);
8384   // Because char has a smaller range than uchar, we can actually get away
8385   // without any newton steps.  This requires that we use a weird bias
8386   // of 0xb000, however (again, this has been exhaustively tested).
8387   // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
8388   X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
8389   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
8390   Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
8391   X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
8392   X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
8393   // Convert back to short.
8394   X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
8395   X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
8396   return X;
8397 }
8398
8399 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
8400                                SelectionDAG &DAG) {
8401   // TODO: Should this propagate fast-math-flags?
8402
8403   SDValue N2;
8404   // Convert to float.
8405   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
8406   // float4 xf = vcvt_f32_s32(vmovl_s16(x));
8407   N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
8408   N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
8409   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
8410   N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
8411
8412   // Use reciprocal estimate and one refinement step.
8413   // float4 recip = vrecpeq_f32(yf);
8414   // recip *= vrecpsq_f32(yf, recip);
8415   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8416                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
8417                    N1);
8418   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8419                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
8420                    N1, N2);
8421   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
8422   // Because short has a smaller range than ushort, we can actually get away
8423   // with only a single newton step.  This requires that we use a weird bias
8424   // of 89, however (again, this has been exhaustively tested).
8425   // float4 result = as_float4(as_int4(xf*recip) + 0x89);
8426   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
8427   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
8428   N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
8429   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
8430   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
8431   // Convert back to integer and return.
8432   // return vmovn_s32(vcvt_s32_f32(result));
8433   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
8434   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
8435   return N0;
8436 }
8437
8438 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
8439                          const ARMSubtarget *ST) {
8440   EVT VT = Op.getValueType();
8441   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
8442          "unexpected type for custom-lowering ISD::SDIV");
8443
8444   SDLoc dl(Op);
8445   SDValue N0 = Op.getOperand(0);
8446   SDValue N1 = Op.getOperand(1);
8447   SDValue N2, N3;
8448
8449   if (VT == MVT::v8i8) {
8450     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
8451     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
8452
8453     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8454                      DAG.getIntPtrConstant(4, dl));
8455     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8456                      DAG.getIntPtrConstant(4, dl));
8457     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8458                      DAG.getIntPtrConstant(0, dl));
8459     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8460                      DAG.getIntPtrConstant(0, dl));
8461
8462     N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
8463     N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
8464
8465     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
8466     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
8467
8468     N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
8469     return N0;
8470   }
8471   return LowerSDIV_v4i16(N0, N1, dl, DAG);
8472 }
8473
8474 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
8475                          const ARMSubtarget *ST) {
8476   // TODO: Should this propagate fast-math-flags?
8477   EVT VT = Op.getValueType();
8478   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
8479          "unexpected type for custom-lowering ISD::UDIV");
8480
8481   SDLoc dl(Op);
8482   SDValue N0 = Op.getOperand(0);
8483   SDValue N1 = Op.getOperand(1);
8484   SDValue N2, N3;
8485
8486   if (VT == MVT::v8i8) {
8487     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
8488     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
8489
8490     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8491                      DAG.getIntPtrConstant(4, dl));
8492     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8493                      DAG.getIntPtrConstant(4, dl));
8494     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
8495                      DAG.getIntPtrConstant(0, dl));
8496     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
8497                      DAG.getIntPtrConstant(0, dl));
8498
8499     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
8500     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
8501
8502     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
8503     N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
8504
8505     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
8506                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
8507                                      MVT::i32),
8508                      N0);
8509     return N0;
8510   }
8511
8512   // v4i16 sdiv ... Convert to float.
8513   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
8514   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
8515   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
8516   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
8517   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
8518   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
8519
8520   // Use reciprocal estimate and two refinement steps.
8521   // float4 recip = vrecpeq_f32(yf);
8522   // recip *= vrecpsq_f32(yf, recip);
8523   // recip *= vrecpsq_f32(yf, recip);
8524   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8525                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
8526                    BN1);
8527   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8528                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
8529                    BN1, N2);
8530   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
8531   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
8532                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
8533                    BN1, N2);
8534   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
8535   // Simply multiplying by the reciprocal estimate can leave us a few ulps
8536   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
8537   // and that it will never cause us to return an answer too large).
8538   // float4 result = as_float4(as_int4(xf*recip) + 2);
8539   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
8540   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
8541   N1 = DAG.getConstant(2, dl, MVT::v4i32);
8542   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
8543   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
8544   // Convert back to integer and return.
8545   // return vmovn_u32(vcvt_s32_f32(result));
8546   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
8547   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
8548   return N0;
8549 }
8550
8551 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
8552   SDNode *N = Op.getNode();
8553   EVT VT = N->getValueType(0);
8554   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
8555
8556   SDValue Carry = Op.getOperand(2);
8557
8558   SDLoc DL(Op);
8559
8560   SDValue Result;
8561   if (Op.getOpcode() == ISD::ADDCARRY) {
8562     // This converts the boolean value carry into the carry flag.
8563     Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
8564
8565     // Do the addition proper using the carry flag we wanted.
8566     Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
8567                          Op.getOperand(1), Carry);
8568
8569     // Now convert the carry flag into a boolean value.
8570     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
8571   } else {
8572     // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
8573     // have to invert the carry first.
8574     Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
8575                         DAG.getConstant(1, DL, MVT::i32), Carry);
8576     // This converts the boolean value carry into the carry flag.
8577     Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
8578
8579     // Do the subtraction proper using the carry flag we wanted.
8580     Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
8581                          Op.getOperand(1), Carry);
8582
8583     // Now convert the carry flag into a boolean value.
8584     Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
8585     // But the carry returned by ARMISD::SUBE is not a borrow as expected
8586     // by ISD::SUBCARRY, so compute 1 - C.
8587     Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
8588                         DAG.getConstant(1, DL, MVT::i32), Carry);
8589   }
8590
8591   // Return both values.
8592   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
8593 }
8594
8595 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
8596   assert(Subtarget->isTargetDarwin());
8597
8598   // For iOS, we want to call an alternative entry point: __sincos_stret,
8599   // return values are passed via sret.
8600   SDLoc dl(Op);
8601   SDValue Arg = Op.getOperand(0);
8602   EVT ArgVT = Arg.getValueType();
8603   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
8604   auto PtrVT = getPointerTy(DAG.getDataLayout());
8605
8606   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8607   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8608
8609   // Pair of floats / doubles used to pass the result.
8610   Type *RetTy = StructType::get(ArgTy, ArgTy);
8611   auto &DL = DAG.getDataLayout();
8612
8613   ArgListTy Args;
8614   bool ShouldUseSRet = Subtarget->isAPCS_ABI();
8615   SDValue SRet;
8616   if (ShouldUseSRet) {
8617     // Create stack object for sret.
8618     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
8619     const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
8620     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
8621     SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
8622
8623     ArgListEntry Entry;
8624     Entry.Node = SRet;
8625     Entry.Ty = RetTy->getPointerTo();
8626     Entry.IsSExt = false;
8627     Entry.IsZExt = false;
8628     Entry.IsSRet = true;
8629     Args.push_back(Entry);
8630     RetTy = Type::getVoidTy(*DAG.getContext());
8631   }
8632
8633   ArgListEntry Entry;
8634   Entry.Node = Arg;
8635   Entry.Ty = ArgTy;
8636   Entry.IsSExt = false;
8637   Entry.IsZExt = false;
8638   Args.push_back(Entry);
8639
8640   RTLIB::Libcall LC =
8641       (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
8642   const char *LibcallName = getLibcallName(LC);
8643   CallingConv::ID CC = getLibcallCallingConv(LC);
8644   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
8645
8646   TargetLowering::CallLoweringInfo CLI(DAG);
8647   CLI.setDebugLoc(dl)
8648       .setChain(DAG.getEntryNode())
8649       .setCallee(CC, RetTy, Callee, std::move(Args))
8650       .setDiscardResult(ShouldUseSRet);
8651   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
8652
8653   if (!ShouldUseSRet)
8654     return CallResult.first;
8655
8656   SDValue LoadSin =
8657       DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
8658
8659   // Address of cos field.
8660   SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
8661                             DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
8662   SDValue LoadCos =
8663       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
8664
8665   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
8666   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
8667                      LoadSin.getValue(0), LoadCos.getValue(0));
8668 }
8669
8670 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
8671                                                   bool Signed,
8672                                                   SDValue &Chain) const {
8673   EVT VT = Op.getValueType();
8674   assert((VT == MVT::i32 || VT == MVT::i64) &&
8675          "unexpected type for custom lowering DIV");
8676   SDLoc dl(Op);
8677
8678   const auto &DL = DAG.getDataLayout();
8679   const auto &TLI = DAG.getTargetLoweringInfo();
8680
8681   const char *Name = nullptr;
8682   if (Signed)
8683     Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
8684   else
8685     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
8686
8687   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
8688
8689   ARMTargetLowering::ArgListTy Args;
8690
8691   for (auto AI : {1, 0}) {
8692     ArgListEntry Arg;
8693     Arg.Node = Op.getOperand(AI);
8694     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
8695     Args.push_back(Arg);
8696   }
8697
8698   CallLoweringInfo CLI(DAG);
8699   CLI.setDebugLoc(dl)
8700     .setChain(Chain)
8701     .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()),
8702                ES, std::move(Args));
8703
8704   return LowerCallTo(CLI).first;
8705 }
8706
8707 // This is a code size optimisation: return the original SDIV node to
8708 // DAGCombiner when we don't want to expand SDIV into a sequence of
8709 // instructions, and an empty node otherwise which will cause the
8710 // SDIV to be expanded in DAGCombine.
8711 SDValue
8712 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
8713                                  SelectionDAG &DAG,
8714                                  SmallVectorImpl<SDNode *> &Created) const {
8715   // TODO: Support SREM
8716   if (N->getOpcode() != ISD::SDIV)
8717     return SDValue();
8718
8719   const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
8720   const bool MinSize = ST.hasMinSize();
8721   const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
8722                                       : ST.hasDivideInARMMode();
8723
8724   // Don't touch vector types; rewriting this may lead to scalarizing
8725   // the int divs.
8726   if (N->getOperand(0).getValueType().isVector())
8727     return SDValue();
8728
8729   // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
8730   // hwdiv support for this to be really profitable.
8731   if (!(MinSize && HasDivide))
8732     return SDValue();
8733
8734   // ARM mode is a bit simpler than Thumb: we can handle large power
8735   // of 2 immediates with 1 mov instruction; no further checks required,
8736   // just return the sdiv node.
8737   if (!ST.isThumb())
8738     return SDValue(N, 0);
8739
8740   // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
8741   // and thus lose the code size benefits of a MOVS that requires only 2.
8742   // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
8743   // but as it's doing exactly this, it's not worth the trouble to get TTI.
8744   if (Divisor.sgt(128))
8745     return SDValue();
8746
8747   return SDValue(N, 0);
8748 }
8749
8750 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
8751                                             bool Signed) const {
8752   assert(Op.getValueType() == MVT::i32 &&
8753          "unexpected type for custom lowering DIV");
8754   SDLoc dl(Op);
8755
8756   SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
8757                                DAG.getEntryNode(), Op.getOperand(1));
8758
8759   return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
8760 }
8761
8762 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
8763   SDLoc DL(N);
8764   SDValue Op = N->getOperand(1);
8765   if (N->getValueType(0) == MVT::i32)
8766     return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
8767   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
8768                            DAG.getConstant(0, DL, MVT::i32));
8769   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op,
8770                            DAG.getConstant(1, DL, MVT::i32));
8771   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
8772                      DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
8773 }
8774
8775 void ARMTargetLowering::ExpandDIV_Windows(
8776     SDValue Op, SelectionDAG &DAG, bool Signed,
8777     SmallVectorImpl<SDValue> &Results) const {
8778   const auto &DL = DAG.getDataLayout();
8779   const auto &TLI = DAG.getTargetLoweringInfo();
8780
8781   assert(Op.getValueType() == MVT::i64 &&
8782          "unexpected type for custom lowering DIV");
8783   SDLoc dl(Op);
8784
8785   SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
8786
8787   SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
8788
8789   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
8790   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
8791                               DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
8792   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
8793
8794   Results.push_back(Lower);
8795   Results.push_back(Upper);
8796 }
8797
8798 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
8799   LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
8800   EVT MemVT = LD->getMemoryVT();
8801   assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
8802          "Expected a predicate type!");
8803   assert(MemVT == Op.getValueType());
8804   assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
8805          "Expected a non-extending load");
8806   assert(LD->isUnindexed() && "Expected a unindexed load");
8807
8808   // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit
8809   // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
8810   // need to make sure that 8/4 bits are actually loaded into the correct
8811   // place, which means loading the value and then shuffling the values into
8812   // the bottom bits of the predicate.
8813   // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
8814   // for BE).
8815
8816   SDLoc dl(Op);
8817   SDValue Load = DAG.getExtLoad(
8818       ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
8819       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
8820       LD->getMemOperand());
8821   SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Load);
8822   if (MemVT != MVT::v16i1)
8823     Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
8824                        DAG.getConstant(0, dl, MVT::i32));
8825   return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
8826 }
8827
8828 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
8829   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
8830   EVT MemVT = ST->getMemoryVT();
8831   assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) &&
8832          "Expected a predicate type!");
8833   assert(MemVT == ST->getValue().getValueType());
8834   assert(!ST->isTruncatingStore() && "Expected a non-extending store");
8835   assert(ST->isUnindexed() && "Expected a unindexed store");
8836
8837   // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits
8838   // unset and a scalar store.
8839   SDLoc dl(Op);
8840   SDValue Build = ST->getValue();
8841   if (MemVT != MVT::v16i1) {
8842     SmallVector<SDValue, 16> Ops;
8843     for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++)
8844       Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
8845                                 DAG.getConstant(I, dl, MVT::i32)));
8846     for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
8847       Ops.push_back(DAG.getUNDEF(MVT::i32));
8848     Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
8849   }
8850   SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
8851   return DAG.getTruncStore(
8852       ST->getChain(), dl, GRP, ST->getBasePtr(),
8853       EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
8854       ST->getMemOperand());
8855 }
8856
8857 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
8858   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
8859   MVT VT = Op.getSimpleValueType();
8860   SDValue Mask = N->getMask();
8861   SDValue PassThru = N->getPassThru();
8862   SDLoc dl(Op);
8863
8864   if (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
8865       (PassThru->getOpcode() == ARMISD::VMOVIMM &&
8866        isNullConstant(PassThru->getOperand(0))))
8867     return Op;
8868
8869   // MVE Masked loads use zero as the passthru value. Here we convert undef to
8870   // zero too, and other values are lowered to a select.
8871   SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
8872                                 DAG.getTargetConstant(0, dl, MVT::i32));
8873   SDValue NewLoad = DAG.getMaskedLoad(
8874       VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
8875       N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
8876   SDValue Combo = NewLoad;
8877   if (!PassThru.isUndef())
8878     Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
8879   return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
8880 }
8881
8882 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
8883   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
8884     // Acquire/Release load/store is not legal for targets without a dmb or
8885     // equivalent available.
8886     return SDValue();
8887
8888   // Monotonic load/store is legal for all targets.
8889   return Op;
8890 }
8891
8892 static void ReplaceREADCYCLECOUNTER(SDNode *N,
8893                                     SmallVectorImpl<SDValue> &Results,
8894                                     SelectionDAG &DAG,
8895                                     const ARMSubtarget *Subtarget) {
8896   SDLoc DL(N);
8897   // Under Power Management extensions, the cycle-count is:
8898   //    mrc p15, #0, <Rt>, c9, c13, #0
8899   SDValue Ops[] = { N->getOperand(0), // Chain
8900                     DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
8901                     DAG.getTargetConstant(15, DL, MVT::i32),
8902                     DAG.getTargetConstant(0, DL, MVT::i32),
8903                     DAG.getTargetConstant(9, DL, MVT::i32),
8904                     DAG.getTargetConstant(13, DL, MVT::i32),
8905                     DAG.getTargetConstant(0, DL, MVT::i32)
8906   };
8907
8908   SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
8909                                  DAG.getVTList(MVT::i32, MVT::Other), Ops);
8910   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
8911                                 DAG.getConstant(0, DL, MVT::i32)));
8912   Results.push_back(Cycles32.getValue(1));
8913 }
8914
8915 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
8916   SDLoc dl(V.getNode());
8917   SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
8918   SDValue VHi = DAG.getAnyExtOrTrunc(
8919       DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
8920       dl, MVT::i32);
8921   bool isBigEndian = DAG.getDataLayout().isBigEndian();
8922   if (isBigEndian)
8923     std::swap (VLo, VHi);
8924   SDValue RegClass =
8925       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
8926   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
8927   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
8928   const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
8929   return SDValue(
8930       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
8931 }
8932
8933 static void ReplaceCMP_SWAP_64Results(SDNode *N,
8934                                        SmallVectorImpl<SDValue> & Results,
8935                                        SelectionDAG &DAG) {
8936   assert(N->getValueType(0) == MVT::i64 &&
8937          "AtomicCmpSwap on types less than 64 should be legal");
8938   SDValue Ops[] = {N->getOperand(1),
8939                    createGPRPairNode(DAG, N->getOperand(2)),
8940                    createGPRPairNode(DAG, N->getOperand(3)),
8941                    N->getOperand(0)};
8942   SDNode *CmpSwap = DAG.getMachineNode(
8943       ARM::CMP_SWAP_64, SDLoc(N),
8944       DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops);
8945
8946   MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
8947   DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
8948
8949   bool isBigEndian = DAG.getDataLayout().isBigEndian();
8950
8951   Results.push_back(
8952       DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
8953                                  SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
8954   Results.push_back(
8955       DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
8956                                  SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
8957   Results.push_back(SDValue(CmpSwap, 2));
8958 }
8959
8960 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
8961                           SelectionDAG &DAG) {
8962   const auto &TLI = DAG.getTargetLoweringInfo();
8963
8964   assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
8965          "Custom lowering is MSVCRT specific!");
8966
8967   SDLoc dl(Op);
8968   SDValue Val = Op.getOperand(0);
8969   MVT Ty = Val->getSimpleValueType(0);
8970   SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1));
8971   SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow",
8972                                          TLI.getPointerTy(DAG.getDataLayout()));
8973
8974   TargetLowering::ArgListTy Args;
8975   TargetLowering::ArgListEntry Entry;
8976
8977   Entry.Node = Val;
8978   Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
8979   Entry.IsZExt = true;
8980   Args.push_back(Entry);
8981
8982   Entry.Node = Exponent;
8983   Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
8984   Entry.IsZExt = true;
8985   Args.push_back(Entry);
8986
8987   Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
8988
8989   // In the in-chain to the call is the entry node  If we are emitting a
8990   // tailcall, the chain will be mutated if the node has a non-entry input
8991   // chain.
8992   SDValue InChain = DAG.getEntryNode();
8993   SDValue TCChain = InChain;
8994
8995   const Function &F = DAG.getMachineFunction().getFunction();
8996   bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
8997               F.getReturnType() == LCRTy;
8998   if (IsTC)
8999     InChain = TCChain;
9000
9001   TargetLowering::CallLoweringInfo CLI(DAG);
9002   CLI.setDebugLoc(dl)
9003       .setChain(InChain)
9004       .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
9005       .setTailCall(IsTC);
9006   std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
9007
9008   // Return the chain (the DAG root) if it is a tail call
9009   return !CI.second.getNode() ? DAG.getRoot() : CI.first;
9010 }
9011
9012 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
9013   LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
9014   switch (Op.getOpcode()) {
9015   default: llvm_unreachable("Don't know how to custom lower this!");
9016   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
9017   case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
9018   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
9019   case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
9020   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
9021   case ISD::SELECT:        return LowerSELECT(Op, DAG);
9022   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
9023   case ISD::BRCOND:        return LowerBRCOND(Op, DAG);
9024   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
9025   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
9026   case ISD::VASTART:       return LowerVASTART(Op, DAG);
9027   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
9028   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
9029   case ISD::SINT_TO_FP:
9030   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
9031   case ISD::FP_TO_SINT:
9032   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
9033   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
9034   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
9035   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
9036   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
9037   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
9038   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
9039   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
9040   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
9041                                                                Subtarget);
9042   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
9043   case ISD::SHL:
9044   case ISD::SRL:
9045   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
9046   case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
9047   case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
9048   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
9049   case ISD::SRL_PARTS:
9050   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
9051   case ISD::CTTZ:
9052   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
9053   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
9054   case ISD::SETCC:         return LowerVSETCC(Op, DAG, Subtarget);
9055   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);
9056   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
9057   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
9058   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
9059   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
9060   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
9061   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
9062   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
9063   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
9064   case ISD::MUL:           return LowerMUL(Op, DAG);
9065   case ISD::SDIV:
9066     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
9067       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
9068     return LowerSDIV(Op, DAG, Subtarget);
9069   case ISD::UDIV:
9070     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
9071       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
9072     return LowerUDIV(Op, DAG, Subtarget);
9073   case ISD::ADDCARRY:
9074   case ISD::SUBCARRY:      return LowerADDSUBCARRY(Op, DAG);
9075   case ISD::SADDO:
9076   case ISD::SSUBO:
9077     return LowerSignedALUO(Op, DAG);
9078   case ISD::UADDO:
9079   case ISD::USUBO:
9080     return LowerUnsignedALUO(Op, DAG);
9081   case ISD::LOAD:
9082     return LowerPredicateLoad(Op, DAG);
9083   case ISD::STORE:
9084     return LowerPredicateStore(Op, DAG);
9085   case ISD::MLOAD:
9086     return LowerMLOAD(Op, DAG);
9087   case ISD::ATOMIC_LOAD:
9088   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
9089   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
9090   case ISD::SDIVREM:
9091   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
9092   case ISD::DYNAMIC_STACKALLOC:
9093     if (Subtarget->isTargetWindows())
9094       return LowerDYNAMIC_STACKALLOC(Op, DAG);
9095     llvm_unreachable("Don't know how to custom lower this!");
9096   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
9097   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
9098   case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
9099   case ARMISD::WIN__DBZCHK: return SDValue();
9100   }
9101 }
9102
9103 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
9104                                  SelectionDAG &DAG) {
9105   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
9106   unsigned Opc = 0;
9107   if (IntNo == Intrinsic::arm_smlald)
9108     Opc = ARMISD::SMLALD;
9109   else if (IntNo == Intrinsic::arm_smlaldx)
9110     Opc = ARMISD::SMLALDX;
9111   else if (IntNo == Intrinsic::arm_smlsld)
9112     Opc = ARMISD::SMLSLD;
9113   else if (IntNo == Intrinsic::arm_smlsldx)
9114     Opc = ARMISD::SMLSLDX;
9115   else
9116     return;
9117
9118   SDLoc dl(N);
9119   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
9120                            N->getOperand(3),
9121                            DAG.getConstant(0, dl, MVT::i32));
9122   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
9123                            N->getOperand(3),
9124                            DAG.getConstant(1, dl, MVT::i32));
9125
9126   SDValue LongMul = DAG.getNode(Opc, dl,
9127                                 DAG.getVTList(MVT::i32, MVT::i32),
9128                                 N->getOperand(1), N->getOperand(2),
9129                                 Lo, Hi);
9130   Results.push_back(LongMul.getValue(0));
9131   Results.push_back(LongMul.getValue(1));
9132 }
9133
9134 /// ReplaceNodeResults - Replace the results of node with an illegal result
9135 /// type with new values built out of custom code.
9136 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
9137                                            SmallVectorImpl<SDValue> &Results,
9138                                            SelectionDAG &DAG) const {
9139   SDValue Res;
9140   switch (N->getOpcode()) {
9141   default:
9142     llvm_unreachable("Don't know how to custom expand this!");
9143   case ISD::READ_REGISTER:
9144     ExpandREAD_REGISTER(N, Results, DAG);
9145     break;
9146   case ISD::BITCAST:
9147     Res = ExpandBITCAST(N, DAG, Subtarget);
9148     break;
9149   case ISD::SRL:
9150   case ISD::SRA:
9151   case ISD::SHL:
9152     Res = Expand64BitShift(N, DAG, Subtarget);
9153     break;
9154   case ISD::SREM:
9155   case ISD::UREM:
9156     Res = LowerREM(N, DAG);
9157     break;
9158   case ISD::SDIVREM:
9159   case ISD::UDIVREM:
9160     Res = LowerDivRem(SDValue(N, 0), DAG);
9161     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
9162     Results.push_back(Res.getValue(0));
9163     Results.push_back(Res.getValue(1));
9164     return;
9165   case ISD::READCYCLECOUNTER:
9166     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
9167     return;
9168   case ISD::UDIV:
9169   case ISD::SDIV:
9170     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
9171     return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
9172                              Results);
9173   case ISD::ATOMIC_CMP_SWAP:
9174     ReplaceCMP_SWAP_64Results(N, Results, DAG);
9175     return;
9176   case ISD::INTRINSIC_WO_CHAIN:
9177     return ReplaceLongIntrinsic(N, Results, DAG);
9178   case ISD::ABS:
9179      lowerABS(N, Results, DAG);
9180      return ;
9181
9182   }
9183   if (Res.getNode())
9184     Results.push_back(Res);
9185 }
9186
9187 //===----------------------------------------------------------------------===//
9188 //                           ARM Scheduler Hooks
9189 //===----------------------------------------------------------------------===//
9190
9191 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
9192 /// registers the function context.
9193 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
9194                                                MachineBasicBlock *MBB,
9195                                                MachineBasicBlock *DispatchBB,
9196                                                int FI) const {
9197   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
9198          "ROPI/RWPI not currently supported with SjLj");
9199   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
9200   DebugLoc dl = MI.getDebugLoc();
9201   MachineFunction *MF = MBB->getParent();
9202   MachineRegisterInfo *MRI = &MF->getRegInfo();
9203   MachineConstantPool *MCP = MF->getConstantPool();
9204   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
9205   const Function &F = MF->getFunction();
9206
9207   bool isThumb = Subtarget->isThumb();
9208   bool isThumb2 = Subtarget->isThumb2();
9209
9210   unsigned PCLabelId = AFI->createPICLabelUId();
9211   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
9212   ARMConstantPoolValue *CPV =
9213     ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
9214   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
9215
9216   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
9217                                            : &ARM::GPRRegClass;
9218
9219   // Grab constant pool and fixed stack memory operands.
9220   MachineMemOperand *CPMMO =
9221       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
9222                                MachineMemOperand::MOLoad, 4, 4);
9223
9224   MachineMemOperand *FIMMOSt =
9225       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
9226                                MachineMemOperand::MOStore, 4, 4);
9227
9228   // Load the address of the dispatch MBB into the jump buffer.
9229   if (isThumb2) {
9230     // Incoming value: jbuf
9231     //   ldr.n  r5, LCPI1_1
9232     //   orr    r5, r5, #1
9233     //   add    r5, pc
9234     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
9235     Register NewVReg1 = MRI->createVirtualRegister(TRC);
9236     BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
9237         .addConstantPoolIndex(CPI)
9238         .addMemOperand(CPMMO)
9239         .add(predOps(ARMCC::AL));
9240     // Set the low bit because of thumb mode.
9241     Register NewVReg2 = MRI->createVirtualRegister(TRC);
9242     BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
9243         .addReg(NewVReg1, RegState::Kill)
9244         .addImm(0x01)
9245         .add(predOps(ARMCC::AL))
9246         .add(condCodeOp());
9247     Register NewVReg3 = MRI->createVirtualRegister(TRC);
9248     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
9249       .addReg(NewVReg2, RegState::Kill)
9250       .addImm(PCLabelId);
9251     BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
9252         .addReg(NewVReg3, RegState::Kill)
9253         .addFrameIndex(FI)
9254         .addImm(36) // &jbuf[1] :: pc
9255         .addMemOperand(FIMMOSt)
9256         .add(predOps(ARMCC::AL));
9257   } else if (isThumb) {
9258     // Incoming value: jbuf
9259     //   ldr.n  r1, LCPI1_4
9260     //   add    r1, pc
9261     //   mov    r2, #1
9262     //   orrs   r1, r2
9263     //   add    r2, $jbuf, #+4 ; &jbuf[1]
9264     //   str    r1, [r2]
9265     Register NewVReg1 = MRI->createVirtualRegister(TRC);
9266     BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
9267         .addConstantPoolIndex(CPI)
9268         .addMemOperand(CPMMO)
9269         .add(predOps(ARMCC::AL));
9270     Register NewVReg2 = MRI->createVirtualRegister(TRC);
9271     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
9272       .addReg(NewVReg1, RegState::Kill)
9273       .addImm(PCLabelId);
9274     // Set the low bit because of thumb mode.
9275     Register NewVReg3 = MRI->createVirtualRegister(TRC);
9276     BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
9277         .addReg(ARM::CPSR, RegState::Define)
9278         .addImm(1)
9279         .add(predOps(ARMCC::AL));
9280     Register NewVReg4 = MRI->createVirtualRegister(TRC);
9281     BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
9282         .addReg(ARM::CPSR, RegState::Define)
9283         .addReg(NewVReg2, RegState::Kill)
9284         .addReg(NewVReg3, RegState::Kill)
9285         .add(predOps(ARMCC::AL));
9286     Register NewVReg5 = MRI->createVirtualRegister(TRC);
9287     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
9288             .addFrameIndex(FI)
9289             .addImm(36); // &jbuf[1] :: pc
9290     BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
9291         .addReg(NewVReg4, RegState::Kill)
9292         .addReg(NewVReg5, RegState::Kill)
9293         .addImm(0)
9294         .addMemOperand(FIMMOSt)
9295         .add(predOps(ARMCC::AL));
9296   } else {
9297     // Incoming value: jbuf
9298     //   ldr  r1, LCPI1_1
9299     //   add  r1, pc, r1
9300     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
9301     Register NewVReg1 = MRI->createVirtualRegister(TRC);
9302     BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
9303         .addConstantPoolIndex(CPI)
9304         .addImm(0)
9305         .addMemOperand(CPMMO)
9306         .add(predOps(ARMCC::AL));
9307     Register NewVReg2 = MRI->createVirtualRegister(TRC);
9308     BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
9309         .addReg(NewVReg1, RegState::Kill)
9310         .addImm(PCLabelId)
9311         .add(predOps(ARMCC::AL));
9312     BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
9313         .addReg(NewVReg2, RegState::Kill)
9314         .addFrameIndex(FI)
9315         .addImm(36) // &jbuf[1] :: pc
9316         .addMemOperand(FIMMOSt)
9317         .add(predOps(ARMCC::AL));
9318   }
9319 }
9320
9321 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
9322                                               MachineBasicBlock *MBB) const {
9323   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
9324   DebugLoc dl = MI.getDebugLoc();
9325   MachineFunction *MF = MBB->getParent();
9326   MachineRegisterInfo *MRI = &MF->getRegInfo();
9327   MachineFrameInfo &MFI = MF->getFrameInfo();
9328   int FI = MFI.getFunctionContextIndex();
9329
9330   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
9331                                                         : &ARM::GPRnopcRegClass;
9332
9333   // Get a mapping of the call site numbers to all of the landing pads they're
9334   // associated with.
9335   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
9336   unsigned MaxCSNum = 0;
9337   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
9338        ++BB) {
9339     if (!BB->isEHPad()) continue;
9340
9341     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
9342     // pad.
9343     for (MachineBasicBlock::iterator
9344            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
9345       if (!II->isEHLabel()) continue;
9346
9347       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
9348       if (!MF->hasCallSiteLandingPad(Sym)) continue;
9349
9350       SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
9351       for (SmallVectorImpl<unsigned>::iterator
9352              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
9353            CSI != CSE; ++CSI) {
9354         CallSiteNumToLPad[*CSI].push_back(&*BB);
9355         MaxCSNum = std::max(MaxCSNum, *CSI);
9356       }
9357       break;
9358     }
9359   }
9360
9361   // Get an ordered list of the machine basic blocks for the jump table.
9362   std::vector<MachineBasicBlock*> LPadList;
9363   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
9364   LPadList.reserve(CallSiteNumToLPad.size());
9365   for (unsigned I = 1; I <= MaxCSNum; ++I) {
9366     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
9367     for (SmallVectorImpl<MachineBasicBlock*>::iterator
9368            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
9369       LPadList.push_back(*II);
9370       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
9371     }
9372   }
9373
9374   assert(!LPadList.empty() &&
9375          "No landing pad destinations for the dispatch jump table!");
9376
9377   // Create the jump table and associated information.
9378   MachineJumpTableInfo *JTI =
9379     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
9380   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
9381
9382   // Create the MBBs for the dispatch code.
9383
9384   // Shove the dispatch's address into the return slot in the function context.
9385   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
9386   DispatchBB->setIsEHPad();
9387
9388   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
9389   unsigned trap_opcode;
9390   if (Subtarget->isThumb())
9391     trap_opcode = ARM::tTRAP;
9392   else
9393     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
9394
9395   BuildMI(TrapBB, dl, TII->get(trap_opcode));
9396   DispatchBB->addSuccessor(TrapBB);
9397
9398   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
9399   DispatchBB->addSuccessor(DispContBB);
9400
9401   // Insert and MBBs.
9402   MF->insert(MF->end(), DispatchBB);
9403   MF->insert(MF->end(), DispContBB);
9404   MF->insert(MF->end(), TrapBB);
9405
9406   // Insert code into the entry block that creates and registers the function
9407   // context.
9408   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
9409
9410   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
9411       MachinePointerInfo::getFixedStack(*MF, FI),
9412       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
9413
9414   MachineInstrBuilder MIB;
9415   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
9416
9417   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
9418   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
9419
9420   // Add a register mask with no preserved registers.  This results in all
9421   // registers being marked as clobbered. This can't work if the dispatch block
9422   // is in a Thumb1 function and is linked with ARM code which uses the FP
9423   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
9424   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
9425
9426   bool IsPositionIndependent = isPositionIndependent();
9427   unsigned NumLPads = LPadList.size();
9428   if (Subtarget->isThumb2()) {
9429     Register NewVReg1 = MRI->createVirtualRegister(TRC);
9430     BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
9431         .addFrameIndex(FI)
9432         .addImm(4)
9433         .addMemOperand(FIMMOLd)
9434         .add(predOps(ARMCC::AL));
9435
9436     if (NumLPads < 256) {
9437       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
9438           .addReg(NewVReg1)
9439           .addImm(LPadList.size())
9440           .add(predOps(ARMCC::AL));
9441     } else {
9442       Register VReg1 = MRI->createVirtualRegister(TRC);
9443       BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
9444           .addImm(NumLPads & 0xFFFF)
9445           .add(predOps(ARMCC::AL));
9446
9447       unsigned VReg2 = VReg1;
9448       if ((NumLPads & 0xFFFF0000) != 0) {
9449         VReg2 = MRI->createVirtualRegister(TRC);
9450         BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
9451             .addReg(VReg1)
9452             .addImm(NumLPads >> 16)
9453             .add(predOps(ARMCC::AL));
9454       }
9455
9456       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
9457           .addReg(NewVReg1)
9458           .addReg(VReg2)
9459           .add(predOps(ARMCC::AL));
9460     }
9461
9462     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
9463       .addMBB(TrapBB)
9464       .addImm(ARMCC::HI)
9465       .addReg(ARM::CPSR);
9466
9467     Register NewVReg3 = MRI->createVirtualRegister(TRC);
9468     BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
9469         .addJumpTableIndex(MJTI)
9470         .add(predOps(ARMCC::AL));
9471
9472     Register NewVReg4 = MRI->createVirtualRegister(TRC);
9473     BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
9474         .addReg(NewVReg3, RegState::Kill)
9475         .addReg(NewVReg1)
9476         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
9477         .add(predOps(ARMCC::AL))
9478         .add(condCodeOp());
9479
9480     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
9481       .addReg(NewVReg4, RegState::Kill)
9482       .addReg(NewVReg1)
9483       .addJumpTableIndex(MJTI);
9484   } else if (Subtarget->isThumb()) {
9485     Register NewVReg1 = MRI->createVirtualRegister(TRC);
9486     BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
9487         .addFrameIndex(FI)
9488         .addImm(1)
9489         .addMemOperand(FIMMOLd)
9490         .add(predOps(ARMCC::AL));
9491
9492     if (NumLPads < 256) {
9493       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
9494           .addReg(NewVReg1)
9495           .addImm(NumLPads)
9496           .add(predOps(ARMCC::AL));
9497     } else {
9498       MachineConstantPool *ConstantPool = MF->getConstantPool();
9499       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
9500       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
9501
9502       // MachineConstantPool wants an explicit alignment.
9503       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
9504       if (Align == 0)
9505         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
9506       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
9507
9508       Register VReg1 = MRI->createVirtualRegister(TRC);
9509       BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
9510           .addReg(VReg1, RegState::Define)
9511           .addConstantPoolIndex(Idx)
9512           .add(predOps(ARMCC::AL));
9513       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
9514           .addReg(NewVReg1)
9515           .addReg(VReg1)
9516           .add(predOps(ARMCC::AL));
9517     }
9518
9519     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
9520       .addMBB(TrapBB)
9521       .addImm(ARMCC::HI)
9522       .addReg(ARM::CPSR);
9523
9524     Register NewVReg2 = MRI->createVirtualRegister(TRC);
9525     BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
9526         .addReg(ARM::CPSR, RegState::Define)
9527         .addReg(NewVReg1)
9528         .addImm(2)
9529         .add(predOps(ARMCC::AL));
9530
9531     Register NewVReg3 = MRI->createVirtualRegister(TRC);
9532     BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
9533         .addJumpTableIndex(MJTI)
9534         .add(predOps(ARMCC::AL));
9535
9536     Register NewVReg4 = MRI->createVirtualRegister(TRC);
9537     BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
9538         .addReg(ARM::CPSR, RegState::Define)
9539         .addReg(NewVReg2, RegState::Kill)
9540         .addReg(NewVReg3)
9541         .add(predOps(ARMCC::AL));
9542
9543     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
9544         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
9545
9546     Register NewVReg5 = MRI->createVirtualRegister(TRC);
9547     BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
9548         .addReg(NewVReg4, RegState::Kill)
9549         .addImm(0)
9550         .addMemOperand(JTMMOLd)
9551         .add(predOps(ARMCC::AL));
9552
9553     unsigned NewVReg6 = NewVReg5;
9554     if (IsPositionIndependent) {
9555       NewVReg6 = MRI->createVirtualRegister(TRC);
9556       BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
9557           .addReg(ARM::CPSR, RegState::Define)
9558           .addReg(NewVReg5, RegState::Kill)
9559           .addReg(NewVReg3)
9560           .add(predOps(ARMCC::AL));
9561     }
9562
9563     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
9564       .addReg(NewVReg6, RegState::Kill)
9565       .addJumpTableIndex(MJTI);
9566   } else {
9567     Register NewVReg1 = MRI->createVirtualRegister(TRC);
9568     BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
9569         .addFrameIndex(FI)
9570         .addImm(4)
9571         .addMemOperand(FIMMOLd)
9572         .add(predOps(ARMCC::AL));
9573
9574     if (NumLPads < 256) {
9575       BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
9576           .addReg(NewVReg1)
9577           .addImm(NumLPads)
9578           .add(predOps(ARMCC::AL));
9579     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
9580       Register VReg1 = MRI->createVirtualRegister(TRC);
9581       BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
9582           .addImm(NumLPads & 0xFFFF)
9583           .add(predOps(ARMCC::AL));
9584
9585       unsigned VReg2 = VReg1;
9586       if ((NumLPads & 0xFFFF0000) != 0) {
9587         VReg2 = MRI->createVirtualRegister(TRC);
9588         BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
9589             .addReg(VReg1)
9590             .addImm(NumLPads >> 16)
9591             .add(predOps(ARMCC::AL));
9592       }
9593
9594       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
9595           .addReg(NewVReg1)
9596           .addReg(VReg2)
9597           .add(predOps(ARMCC::AL));
9598     } else {
9599       MachineConstantPool *ConstantPool = MF->getConstantPool();
9600       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
9601       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
9602
9603       // MachineConstantPool wants an explicit alignment.
9604       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
9605       if (Align == 0)
9606         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
9607       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
9608
9609       Register VReg1 = MRI->createVirtualRegister(TRC);
9610       BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
9611           .addReg(VReg1, RegState::Define)
9612           .addConstantPoolIndex(Idx)
9613           .addImm(0)
9614           .add(predOps(ARMCC::AL));
9615       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
9616           .addReg(NewVReg1)
9617           .addReg(VReg1, RegState::Kill)
9618           .add(predOps(ARMCC::AL));
9619     }
9620
9621     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
9622       .addMBB(TrapBB)
9623       .addImm(ARMCC::HI)
9624       .addReg(ARM::CPSR);
9625
9626     Register NewVReg3 = MRI->createVirtualRegister(TRC);
9627     BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
9628         .addReg(NewVReg1)
9629         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
9630         .add(predOps(ARMCC::AL))
9631         .add(condCodeOp());
9632     Register NewVReg4 = MRI->createVirtualRegister(TRC);
9633     BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
9634         .addJumpTableIndex(MJTI)
9635         .add(predOps(ARMCC::AL));
9636
9637     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
9638         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
9639     Register NewVReg5 = MRI->createVirtualRegister(TRC);
9640     BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
9641         .addReg(NewVReg3, RegState::Kill)
9642         .addReg(NewVReg4)
9643         .addImm(0)
9644         .addMemOperand(JTMMOLd)
9645         .add(predOps(ARMCC::AL));
9646
9647     if (IsPositionIndependent) {
9648       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
9649         .addReg(NewVReg5, RegState::Kill)
9650         .addReg(NewVReg4)
9651         .addJumpTableIndex(MJTI);
9652     } else {
9653       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
9654         .addReg(NewVReg5, RegState::Kill)
9655         .addJumpTableIndex(MJTI);
9656     }
9657   }
9658
9659   // Add the jump table entries as successors to the MBB.
9660   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
9661   for (std::vector<MachineBasicBlock*>::iterator
9662          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
9663     MachineBasicBlock *CurMBB = *I;
9664     if (SeenMBBs.insert(CurMBB).second)
9665       DispContBB->addSuccessor(CurMBB);
9666   }
9667
9668   // N.B. the order the invoke BBs are processed in doesn't matter here.
9669   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
9670   SmallVector<MachineBasicBlock*, 64> MBBLPads;
9671   for (MachineBasicBlock *BB : InvokeBBs) {
9672
9673     // Remove the landing pad successor from the invoke block and replace it
9674     // with the new dispatch block.
9675     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
9676                                                   BB->succ_end());
9677     while (!Successors.empty()) {
9678       MachineBasicBlock *SMBB = Successors.pop_back_val();
9679       if (SMBB->isEHPad()) {
9680         BB->removeSuccessor(SMBB);
9681         MBBLPads.push_back(SMBB);
9682       }
9683     }
9684
9685     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
9686     BB->normalizeSuccProbs();
9687
9688     // Find the invoke call and mark all of the callee-saved registers as
9689     // 'implicit defined' so that they're spilled. This prevents code from
9690     // moving instructions to before the EH block, where they will never be
9691     // executed.
9692     for (MachineBasicBlock::reverse_iterator
9693            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
9694       if (!II->isCall()) continue;
9695
9696       DenseMap<unsigned, bool> DefRegs;
9697       for (MachineInstr::mop_iterator
9698              OI = II->operands_begin(), OE = II->operands_end();
9699            OI != OE; ++OI) {
9700         if (!OI->isReg()) continue;
9701         DefRegs[OI->getReg()] = true;
9702       }
9703
9704       MachineInstrBuilder MIB(*MF, &*II);
9705
9706       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
9707         unsigned Reg = SavedRegs[i];
9708         if (Subtarget->isThumb2() &&
9709             !ARM::tGPRRegClass.contains(Reg) &&
9710             !ARM::hGPRRegClass.contains(Reg))
9711           continue;
9712         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
9713           continue;
9714         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
9715           continue;
9716         if (!DefRegs[Reg])
9717           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
9718       }
9719
9720       break;
9721     }
9722   }
9723
9724   // Mark all former landing pads as non-landing pads. The dispatch is the only
9725   // landing pad now.
9726   for (SmallVectorImpl<MachineBasicBlock*>::iterator
9727          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
9728     (*I)->setIsEHPad(false);
9729
9730   // The instruction is gone now.
9731   MI.eraseFromParent();
9732 }
9733
9734 static
9735 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
9736   for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
9737        E = MBB->succ_end(); I != E; ++I)
9738     if (*I != Succ)
9739       return *I;
9740   llvm_unreachable("Expecting a BB with two successors!");
9741 }
9742
9743 /// Return the load opcode for a given load size. If load size >= 8,
9744 /// neon opcode will be returned.
9745 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
9746   if (LdSize >= 8)
9747     return LdSize == 16 ? ARM::VLD1q32wb_fixed
9748                         : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
9749   if (IsThumb1)
9750     return LdSize == 4 ? ARM::tLDRi
9751                        : LdSize == 2 ? ARM::tLDRHi
9752                                      : LdSize == 1 ? ARM::tLDRBi : 0;
9753   if (IsThumb2)
9754     return LdSize == 4 ? ARM::t2LDR_POST
9755                        : LdSize == 2 ? ARM::t2LDRH_POST
9756                                      : LdSize == 1 ? ARM::t2LDRB_POST : 0;
9757   return LdSize == 4 ? ARM::LDR_POST_IMM
9758                      : LdSize == 2 ? ARM::LDRH_POST
9759                                    : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
9760 }
9761
9762 /// Return the store opcode for a given store size. If store size >= 8,
9763 /// neon opcode will be returned.
9764 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
9765   if (StSize >= 8)
9766     return StSize == 16 ? ARM::VST1q32wb_fixed
9767                         : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
9768   if (IsThumb1)
9769     return StSize == 4 ? ARM::tSTRi
9770                        : StSize == 2 ? ARM::tSTRHi
9771                                      : StSize == 1 ? ARM::tSTRBi : 0;
9772   if (IsThumb2)
9773     return StSize == 4 ? ARM::t2STR_POST
9774                        : StSize == 2 ? ARM::t2STRH_POST
9775                                      : StSize == 1 ? ARM::t2STRB_POST : 0;
9776   return StSize == 4 ? ARM::STR_POST_IMM
9777                      : StSize == 2 ? ARM::STRH_POST
9778                                    : StSize == 1 ? ARM::STRB_POST_IMM : 0;
9779 }
9780
9781 /// Emit a post-increment load operation with given size. The instructions
9782 /// will be added to BB at Pos.
9783 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
9784                        const TargetInstrInfo *TII, const DebugLoc &dl,
9785                        unsigned LdSize, unsigned Data, unsigned AddrIn,
9786                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
9787   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
9788   assert(LdOpc != 0 && "Should have a load opcode");
9789   if (LdSize >= 8) {
9790     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9791         .addReg(AddrOut, RegState::Define)
9792         .addReg(AddrIn)
9793         .addImm(0)
9794         .add(predOps(ARMCC::AL));
9795   } else if (IsThumb1) {
9796     // load + update AddrIn
9797     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9798         .addReg(AddrIn)
9799         .addImm(0)
9800         .add(predOps(ARMCC::AL));
9801     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
9802         .add(t1CondCodeOp())
9803         .addReg(AddrIn)
9804         .addImm(LdSize)
9805         .add(predOps(ARMCC::AL));
9806   } else if (IsThumb2) {
9807     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9808         .addReg(AddrOut, RegState::Define)
9809         .addReg(AddrIn)
9810         .addImm(LdSize)
9811         .add(predOps(ARMCC::AL));
9812   } else { // arm
9813     BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
9814         .addReg(AddrOut, RegState::Define)
9815         .addReg(AddrIn)
9816         .addReg(0)
9817         .addImm(LdSize)
9818         .add(predOps(ARMCC::AL));
9819   }
9820 }
9821
9822 /// Emit a post-increment store operation with given size. The instructions
9823 /// will be added to BB at Pos.
9824 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
9825                        const TargetInstrInfo *TII, const DebugLoc &dl,
9826                        unsigned StSize, unsigned Data, unsigned AddrIn,
9827                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
9828   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
9829   assert(StOpc != 0 && "Should have a store opcode");
9830   if (StSize >= 8) {
9831     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
9832         .addReg(AddrIn)
9833         .addImm(0)
9834         .addReg(Data)
9835         .add(predOps(ARMCC::AL));
9836   } else if (IsThumb1) {
9837     // store + update AddrIn
9838     BuildMI(*BB, Pos, dl, TII->get(StOpc))
9839         .addReg(Data)
9840         .addReg(AddrIn)
9841         .addImm(0)
9842         .add(predOps(ARMCC::AL));
9843     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
9844         .add(t1CondCodeOp())
9845         .addReg(AddrIn)
9846         .addImm(StSize)
9847         .add(predOps(ARMCC::AL));
9848   } else if (IsThumb2) {
9849     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
9850         .addReg(Data)
9851         .addReg(AddrIn)
9852         .addImm(StSize)
9853         .add(predOps(ARMCC::AL));
9854   } else { // arm
9855     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
9856         .addReg(Data)
9857         .addReg(AddrIn)
9858         .addReg(0)
9859         .addImm(StSize)
9860         .add(predOps(ARMCC::AL));
9861   }
9862 }
9863
9864 MachineBasicBlock *
9865 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
9866                                    MachineBasicBlock *BB) const {
9867   // This pseudo instruction has 3 operands: dst, src, size
9868   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
9869   // Otherwise, we will generate unrolled scalar copies.
9870   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
9871   const BasicBlock *LLVM_BB = BB->getBasicBlock();
9872   MachineFunction::iterator It = ++BB->getIterator();
9873
9874   Register dest = MI.getOperand(0).getReg();
9875   Register src = MI.getOperand(1).getReg();
9876   unsigned SizeVal = MI.getOperand(2).getImm();
9877   unsigned Align = MI.getOperand(3).getImm();
9878   DebugLoc dl = MI.getDebugLoc();
9879
9880   MachineFunction *MF = BB->getParent();
9881   MachineRegisterInfo &MRI = MF->getRegInfo();
9882   unsigned UnitSize = 0;
9883   const TargetRegisterClass *TRC = nullptr;
9884   const TargetRegisterClass *VecTRC = nullptr;
9885
9886   bool IsThumb1 = Subtarget->isThumb1Only();
9887   bool IsThumb2 = Subtarget->isThumb2();
9888   bool IsThumb = Subtarget->isThumb();
9889
9890   if (Align & 1) {
9891     UnitSize = 1;
9892   } else if (Align & 2) {
9893     UnitSize = 2;
9894   } else {
9895     // Check whether we can use NEON instructions.
9896     if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
9897         Subtarget->hasNEON()) {
9898       if ((Align % 16 == 0) && SizeVal >= 16)
9899         UnitSize = 16;
9900       else if ((Align % 8 == 0) && SizeVal >= 8)
9901         UnitSize = 8;
9902     }
9903     // Can't use NEON instructions.
9904     if (UnitSize == 0)
9905       UnitSize = 4;
9906   }
9907
9908   // Select the correct opcode and register class for unit size load/store
9909   bool IsNeon = UnitSize >= 8;
9910   TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
9911   if (IsNeon)
9912     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
9913                             : UnitSize == 8 ? &ARM::DPRRegClass
9914                                             : nullptr;
9915
9916   unsigned BytesLeft = SizeVal % UnitSize;
9917   unsigned LoopSize = SizeVal - BytesLeft;
9918
9919   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
9920     // Use LDR and STR to copy.
9921     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
9922     // [destOut] = STR_POST(scratch, destIn, UnitSize)
9923     unsigned srcIn = src;
9924     unsigned destIn = dest;
9925     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
9926       Register srcOut = MRI.createVirtualRegister(TRC);
9927       Register destOut = MRI.createVirtualRegister(TRC);
9928       Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
9929       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
9930                  IsThumb1, IsThumb2);
9931       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
9932                  IsThumb1, IsThumb2);
9933       srcIn = srcOut;
9934       destIn = destOut;
9935     }
9936
9937     // Handle the leftover bytes with LDRB and STRB.
9938     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
9939     // [destOut] = STRB_POST(scratch, destIn, 1)
9940     for (unsigned i = 0; i < BytesLeft; i++) {
9941       Register srcOut = MRI.createVirtualRegister(TRC);
9942       Register destOut = MRI.createVirtualRegister(TRC);
9943       Register scratch = MRI.createVirtualRegister(TRC);
9944       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
9945                  IsThumb1, IsThumb2);
9946       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
9947                  IsThumb1, IsThumb2);
9948       srcIn = srcOut;
9949       destIn = destOut;
9950     }
9951     MI.eraseFromParent(); // The instruction is gone now.
9952     return BB;
9953   }
9954
9955   // Expand the pseudo op to a loop.
9956   // thisMBB:
9957   //   ...
9958   //   movw varEnd, # --> with thumb2
9959   //   movt varEnd, #
9960   //   ldrcp varEnd, idx --> without thumb2
9961   //   fallthrough --> loopMBB
9962   // loopMBB:
9963   //   PHI varPhi, varEnd, varLoop
9964   //   PHI srcPhi, src, srcLoop
9965   //   PHI destPhi, dst, destLoop
9966   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
9967   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
9968   //   subs varLoop, varPhi, #UnitSize
9969   //   bne loopMBB
9970   //   fallthrough --> exitMBB
9971   // exitMBB:
9972   //   epilogue to handle left-over bytes
9973   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
9974   //   [destOut] = STRB_POST(scratch, destLoop, 1)
9975   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
9976   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
9977   MF->insert(It, loopMBB);
9978   MF->insert(It, exitMBB);
9979
9980   // Transfer the remainder of BB and its successor edges to exitMBB.
9981   exitMBB->splice(exitMBB->begin(), BB,
9982                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
9983   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
9984
9985   // Load an immediate to varEnd.
9986   Register varEnd = MRI.createVirtualRegister(TRC);
9987   if (Subtarget->useMovt()) {
9988     unsigned Vtmp = varEnd;
9989     if ((LoopSize & 0xFFFF0000) != 0)
9990       Vtmp = MRI.createVirtualRegister(TRC);
9991     BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
9992         .addImm(LoopSize & 0xFFFF)
9993         .add(predOps(ARMCC::AL));
9994
9995     if ((LoopSize & 0xFFFF0000) != 0)
9996       BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
9997           .addReg(Vtmp)
9998           .addImm(LoopSize >> 16)
9999           .add(predOps(ARMCC::AL));
10000   } else {
10001     MachineConstantPool *ConstantPool = MF->getConstantPool();
10002     Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10003     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
10004
10005     // MachineConstantPool wants an explicit alignment.
10006     unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
10007     if (Align == 0)
10008       Align = MF->getDataLayout().getTypeAllocSize(C->getType());
10009     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
10010     MachineMemOperand *CPMMO =
10011         MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10012                                  MachineMemOperand::MOLoad, 4, 4);
10013
10014     if (IsThumb)
10015       BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
10016           .addReg(varEnd, RegState::Define)
10017           .addConstantPoolIndex(Idx)
10018           .add(predOps(ARMCC::AL))
10019           .addMemOperand(CPMMO);
10020     else
10021       BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
10022           .addReg(varEnd, RegState::Define)
10023           .addConstantPoolIndex(Idx)
10024           .addImm(0)
10025           .add(predOps(ARMCC::AL))
10026           .addMemOperand(CPMMO);
10027   }
10028   BB->addSuccessor(loopMBB);
10029
10030   // Generate the loop body:
10031   //   varPhi = PHI(varLoop, varEnd)
10032   //   srcPhi = PHI(srcLoop, src)
10033   //   destPhi = PHI(destLoop, dst)
10034   MachineBasicBlock *entryBB = BB;
10035   BB = loopMBB;
10036   Register varLoop = MRI.createVirtualRegister(TRC);
10037   Register varPhi = MRI.createVirtualRegister(TRC);
10038   Register srcLoop = MRI.createVirtualRegister(TRC);
10039   Register srcPhi = MRI.createVirtualRegister(TRC);
10040   Register destLoop = MRI.createVirtualRegister(TRC);
10041   Register destPhi = MRI.createVirtualRegister(TRC);
10042
10043   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
10044     .addReg(varLoop).addMBB(loopMBB)
10045     .addReg(varEnd).addMBB(entryBB);
10046   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
10047     .addReg(srcLoop).addMBB(loopMBB)
10048     .addReg(src).addMBB(entryBB);
10049   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
10050     .addReg(destLoop).addMBB(loopMBB)
10051     .addReg(dest).addMBB(entryBB);
10052
10053   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
10054   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
10055   Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
10056   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
10057              IsThumb1, IsThumb2);
10058   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
10059              IsThumb1, IsThumb2);
10060
10061   // Decrement loop variable by UnitSize.
10062   if (IsThumb1) {
10063     BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
10064         .add(t1CondCodeOp())
10065         .addReg(varPhi)
10066         .addImm(UnitSize)
10067         .add(predOps(ARMCC::AL));
10068   } else {
10069     MachineInstrBuilder MIB =
10070         BuildMI(*BB, BB->end(), dl,
10071                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
10072     MIB.addReg(varPhi)
10073         .addImm(UnitSize)
10074         .add(predOps(ARMCC::AL))
10075         .add(condCodeOp());
10076     MIB->getOperand(5).setReg(ARM::CPSR);
10077     MIB->getOperand(5).setIsDef(true);
10078   }
10079   BuildMI(*BB, BB->end(), dl,
10080           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
10081       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
10082
10083   // loopMBB can loop back to loopMBB or fall through to exitMBB.
10084   BB->addSuccessor(loopMBB);
10085   BB->addSuccessor(exitMBB);
10086
10087   // Add epilogue to handle BytesLeft.
10088   BB = exitMBB;
10089   auto StartOfExit = exitMBB->begin();
10090
10091   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
10092   //   [destOut] = STRB_POST(scratch, destLoop, 1)
10093   unsigned srcIn = srcLoop;
10094   unsigned destIn = destLoop;
10095   for (unsigned i = 0; i < BytesLeft; i++) {
10096     Register srcOut = MRI.createVirtualRegister(TRC);
10097     Register destOut = MRI.createVirtualRegister(TRC);
10098     Register scratch = MRI.createVirtualRegister(TRC);
10099     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
10100                IsThumb1, IsThumb2);
10101     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
10102                IsThumb1, IsThumb2);
10103     srcIn = srcOut;
10104     destIn = destOut;
10105   }
10106
10107   MI.eraseFromParent(); // The instruction is gone now.
10108   return BB;
10109 }
10110
10111 MachineBasicBlock *
10112 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
10113                                        MachineBasicBlock *MBB) const {
10114   const TargetMachine &TM = getTargetMachine();
10115   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
10116   DebugLoc DL = MI.getDebugLoc();
10117
10118   assert(Subtarget->isTargetWindows() &&
10119          "__chkstk is only supported on Windows");
10120   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
10121
10122   // __chkstk takes the number of words to allocate on the stack in R4, and
10123   // returns the stack adjustment in number of bytes in R4.  This will not
10124   // clober any other registers (other than the obvious lr).
10125   //
10126   // Although, technically, IP should be considered a register which may be
10127   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
10128   // thumb-2 environment, so there is no interworking required.  As a result, we
10129   // do not expect a veneer to be emitted by the linker, clobbering IP.
10130   //
10131   // Each module receives its own copy of __chkstk, so no import thunk is
10132   // required, again, ensuring that IP is not clobbered.
10133   //
10134   // Finally, although some linkers may theoretically provide a trampoline for
10135   // out of range calls (which is quite common due to a 32M range limitation of
10136   // branches for Thumb), we can generate the long-call version via
10137   // -mcmodel=large, alleviating the need for the trampoline which may clobber
10138   // IP.
10139
10140   switch (TM.getCodeModel()) {
10141   case CodeModel::Tiny:
10142     llvm_unreachable("Tiny code model not available on ARM.");
10143   case CodeModel::Small:
10144   case CodeModel::Medium:
10145   case CodeModel::Kernel:
10146     BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
10147         .add(predOps(ARMCC::AL))
10148         .addExternalSymbol("__chkstk")
10149         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
10150         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
10151         .addReg(ARM::R12,
10152                 RegState::Implicit | RegState::Define | RegState::Dead)
10153         .addReg(ARM::CPSR,
10154                 RegState::Implicit | RegState::Define | RegState::Dead);
10155     break;
10156   case CodeModel::Large: {
10157     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
10158     Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
10159
10160     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
10161       .addExternalSymbol("__chkstk");
10162     BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
10163         .add(predOps(ARMCC::AL))
10164         .addReg(Reg, RegState::Kill)
10165         .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
10166         .addReg(ARM::R4, RegState::Implicit | RegState::Define)
10167         .addReg(ARM::R12,
10168                 RegState::Implicit | RegState::Define | RegState::Dead)
10169         .addReg(ARM::CPSR,
10170                 RegState::Implicit | RegState::Define | RegState::Dead);
10171     break;
10172   }
10173   }
10174
10175   BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
10176       .addReg(ARM::SP, RegState::Kill)
10177       .addReg(ARM::R4, RegState::Kill)
10178       .setMIFlags(MachineInstr::FrameSetup)
10179       .add(predOps(ARMCC::AL))
10180       .add(condCodeOp());
10181
10182   MI.eraseFromParent();
10183   return MBB;
10184 }
10185
10186 MachineBasicBlock *
10187 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
10188                                        MachineBasicBlock *MBB) const {
10189   DebugLoc DL = MI.getDebugLoc();
10190   MachineFunction *MF = MBB->getParent();
10191   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10192
10193   MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
10194   MF->insert(++MBB->getIterator(), ContBB);
10195   ContBB->splice(ContBB->begin(), MBB,
10196                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
10197   ContBB->transferSuccessorsAndUpdatePHIs(MBB);
10198   MBB->addSuccessor(ContBB);
10199
10200   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10201   BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
10202   MF->push_back(TrapBB);
10203   MBB->addSuccessor(TrapBB);
10204
10205   BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
10206       .addReg(MI.getOperand(0).getReg())
10207       .addImm(0)
10208       .add(predOps(ARMCC::AL));
10209   BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
10210       .addMBB(TrapBB)
10211       .addImm(ARMCC::EQ)
10212       .addReg(ARM::CPSR);
10213
10214   MI.eraseFromParent();
10215   return ContBB;
10216 }
10217
10218 // The CPSR operand of SelectItr might be missing a kill marker
10219 // because there were multiple uses of CPSR, and ISel didn't know
10220 // which to mark. Figure out whether SelectItr should have had a
10221 // kill marker, and set it if it should. Returns the correct kill
10222 // marker value.
10223 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
10224                                    MachineBasicBlock* BB,
10225                                    const TargetRegisterInfo* TRI) {
10226   // Scan forward through BB for a use/def of CPSR.
10227   MachineBasicBlock::iterator miI(std::next(SelectItr));
10228   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
10229     const MachineInstr& mi = *miI;
10230     if (mi.readsRegister(ARM::CPSR))
10231       return false;
10232     if (mi.definesRegister(ARM::CPSR))
10233       break; // Should have kill-flag - update below.
10234   }
10235
10236   // If we hit the end of the block, check whether CPSR is live into a
10237   // successor.
10238   if (miI == BB->end()) {
10239     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
10240                                           sEnd = BB->succ_end();
10241          sItr != sEnd; ++sItr) {
10242       MachineBasicBlock* succ = *sItr;
10243       if (succ->isLiveIn(ARM::CPSR))
10244         return false;
10245     }
10246   }
10247
10248   // We found a def, or hit the end of the basic block and CPSR wasn't live
10249   // out. SelectMI should have a kill flag on CPSR.
10250   SelectItr->addRegisterKilled(ARM::CPSR, TRI);
10251   return true;
10252 }
10253
10254 MachineBasicBlock *
10255 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
10256                                                MachineBasicBlock *BB) const {
10257   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10258   DebugLoc dl = MI.getDebugLoc();
10259   bool isThumb2 = Subtarget->isThumb2();
10260   switch (MI.getOpcode()) {
10261   default: {
10262     MI.print(errs());
10263     llvm_unreachable("Unexpected instr type to insert");
10264   }
10265
10266   // Thumb1 post-indexed loads are really just single-register LDMs.
10267   case ARM::tLDR_postidx: {
10268     MachineOperand Def(MI.getOperand(1));
10269     BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
10270         .add(Def)  // Rn_wb
10271         .add(MI.getOperand(2))  // Rn
10272         .add(MI.getOperand(3))  // PredImm
10273         .add(MI.getOperand(4))  // PredReg
10274         .add(MI.getOperand(0))  // Rt
10275         .cloneMemRefs(MI);
10276     MI.eraseFromParent();
10277     return BB;
10278   }
10279
10280   // The Thumb2 pre-indexed stores have the same MI operands, they just
10281   // define them differently in the .td files from the isel patterns, so
10282   // they need pseudos.
10283   case ARM::t2STR_preidx:
10284     MI.setDesc(TII->get(ARM::t2STR_PRE));
10285     return BB;
10286   case ARM::t2STRB_preidx:
10287     MI.setDesc(TII->get(ARM::t2STRB_PRE));
10288     return BB;
10289   case ARM::t2STRH_preidx:
10290     MI.setDesc(TII->get(ARM::t2STRH_PRE));
10291     return BB;
10292
10293   case ARM::STRi_preidx:
10294   case ARM::STRBi_preidx: {
10295     unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
10296                                                          : ARM::STRB_PRE_IMM;
10297     // Decode the offset.
10298     unsigned Offset = MI.getOperand(4).getImm();
10299     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
10300     Offset = ARM_AM::getAM2Offset(Offset);
10301     if (isSub)
10302       Offset = -Offset;
10303
10304     MachineMemOperand *MMO = *MI.memoperands_begin();
10305     BuildMI(*BB, MI, dl, TII->get(NewOpc))
10306         .add(MI.getOperand(0)) // Rn_wb
10307         .add(MI.getOperand(1)) // Rt
10308         .add(MI.getOperand(2)) // Rn
10309         .addImm(Offset)        // offset (skip GPR==zero_reg)
10310         .add(MI.getOperand(5)) // pred
10311         .add(MI.getOperand(6))
10312         .addMemOperand(MMO);
10313     MI.eraseFromParent();
10314     return BB;
10315   }
10316   case ARM::STRr_preidx:
10317   case ARM::STRBr_preidx:
10318   case ARM::STRH_preidx: {
10319     unsigned NewOpc;
10320     switch (MI.getOpcode()) {
10321     default: llvm_unreachable("unexpected opcode!");
10322     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
10323     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
10324     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
10325     }
10326     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
10327     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
10328       MIB.add(MI.getOperand(i));
10329     MI.eraseFromParent();
10330     return BB;
10331   }
10332
10333   case ARM::tMOVCCr_pseudo: {
10334     // To "insert" a SELECT_CC instruction, we actually have to insert the
10335     // diamond control-flow pattern.  The incoming instruction knows the
10336     // destination vreg to set, the condition code register to branch on, the
10337     // true/false values to select between, and a branch opcode to use.
10338     const BasicBlock *LLVM_BB = BB->getBasicBlock();
10339     MachineFunction::iterator It = ++BB->getIterator();
10340
10341     //  thisMBB:
10342     //  ...
10343     //   TrueVal = ...
10344     //   cmpTY ccX, r1, r2
10345     //   bCC copy1MBB
10346     //   fallthrough --> copy0MBB
10347     MachineBasicBlock *thisMBB  = BB;
10348     MachineFunction *F = BB->getParent();
10349     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
10350     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
10351     F->insert(It, copy0MBB);
10352     F->insert(It, sinkMBB);
10353
10354     // Check whether CPSR is live past the tMOVCCr_pseudo.
10355     const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
10356     if (!MI.killsRegister(ARM::CPSR) &&
10357         !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
10358       copy0MBB->addLiveIn(ARM::CPSR);
10359       sinkMBB->addLiveIn(ARM::CPSR);
10360     }
10361
10362     // Transfer the remainder of BB and its successor edges to sinkMBB.
10363     sinkMBB->splice(sinkMBB->begin(), BB,
10364                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
10365     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
10366
10367     BB->addSuccessor(copy0MBB);
10368     BB->addSuccessor(sinkMBB);
10369
10370     BuildMI(BB, dl, TII->get(ARM::tBcc))
10371         .addMBB(sinkMBB)
10372         .addImm(MI.getOperand(3).getImm())
10373         .addReg(MI.getOperand(4).getReg());
10374
10375     //  copy0MBB:
10376     //   %FalseValue = ...
10377     //   # fallthrough to sinkMBB
10378     BB = copy0MBB;
10379
10380     // Update machine-CFG edges
10381     BB->addSuccessor(sinkMBB);
10382
10383     //  sinkMBB:
10384     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
10385     //  ...
10386     BB = sinkMBB;
10387     BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
10388         .addReg(MI.getOperand(1).getReg())
10389         .addMBB(copy0MBB)
10390         .addReg(MI.getOperand(2).getReg())
10391         .addMBB(thisMBB);
10392
10393     MI.eraseFromParent(); // The pseudo instruction is gone now.
10394     return BB;
10395   }
10396
10397   case ARM::BCCi64:
10398   case ARM::BCCZi64: {
10399     // If there is an unconditional branch to the other successor, remove it.
10400     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
10401
10402     // Compare both parts that make up the double comparison separately for
10403     // equality.
10404     bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
10405
10406     Register LHS1 = MI.getOperand(1).getReg();
10407     Register LHS2 = MI.getOperand(2).getReg();
10408     if (RHSisZero) {
10409       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
10410           .addReg(LHS1)
10411           .addImm(0)
10412           .add(predOps(ARMCC::AL));
10413       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
10414         .addReg(LHS2).addImm(0)
10415         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
10416     } else {
10417       Register RHS1 = MI.getOperand(3).getReg();
10418       Register RHS2 = MI.getOperand(4).getReg();
10419       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
10420           .addReg(LHS1)
10421           .addReg(RHS1)
10422           .add(predOps(ARMCC::AL));
10423       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
10424         .addReg(LHS2).addReg(RHS2)
10425         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
10426     }
10427
10428     MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
10429     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
10430     if (MI.getOperand(0).getImm() == ARMCC::NE)
10431       std::swap(destMBB, exitMBB);
10432
10433     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
10434       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
10435     if (isThumb2)
10436       BuildMI(BB, dl, TII->get(ARM::t2B))
10437           .addMBB(exitMBB)
10438           .add(predOps(ARMCC::AL));
10439     else
10440       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
10441
10442     MI.eraseFromParent(); // The pseudo instruction is gone now.
10443     return BB;
10444   }
10445
10446   case ARM::Int_eh_sjlj_setjmp:
10447   case ARM::Int_eh_sjlj_setjmp_nofp:
10448   case ARM::tInt_eh_sjlj_setjmp:
10449   case ARM::t2Int_eh_sjlj_setjmp:
10450   case ARM::t2Int_eh_sjlj_setjmp_nofp:
10451     return BB;
10452
10453   case ARM::Int_eh_sjlj_setup_dispatch:
10454     EmitSjLjDispatchBlock(MI, BB);
10455     return BB;
10456
10457   case ARM::ABS:
10458   case ARM::t2ABS: {
10459     // To insert an ABS instruction, we have to insert the
10460     // diamond control-flow pattern.  The incoming instruction knows the
10461     // source vreg to test against 0, the destination vreg to set,
10462     // the condition code register to branch on, the
10463     // true/false values to select between, and a branch opcode to use.
10464     // It transforms
10465     //     V1 = ABS V0
10466     // into
10467     //     V2 = MOVS V0
10468     //     BCC                      (branch to SinkBB if V0 >= 0)
10469     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
10470     //     SinkBB: V1 = PHI(V2, V3)
10471     const BasicBlock *LLVM_BB = BB->getBasicBlock();
10472     MachineFunction::iterator BBI = ++BB->getIterator();
10473     MachineFunction *Fn = BB->getParent();
10474     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
10475     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
10476     Fn->insert(BBI, RSBBB);
10477     Fn->insert(BBI, SinkBB);
10478
10479     Register ABSSrcReg = MI.getOperand(1).getReg();
10480     Register ABSDstReg = MI.getOperand(0).getReg();
10481     bool ABSSrcKIll = MI.getOperand(1).isKill();
10482     bool isThumb2 = Subtarget->isThumb2();
10483     MachineRegisterInfo &MRI = Fn->getRegInfo();
10484     // In Thumb mode S must not be specified if source register is the SP or
10485     // PC and if destination register is the SP, so restrict register class
10486     Register NewRsbDstReg = MRI.createVirtualRegister(
10487         isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
10488
10489     // Transfer the remainder of BB and its successor edges to sinkMBB.
10490     SinkBB->splice(SinkBB->begin(), BB,
10491                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
10492     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
10493
10494     BB->addSuccessor(RSBBB);
10495     BB->addSuccessor(SinkBB);
10496
10497     // fall through to SinkMBB
10498     RSBBB->addSuccessor(SinkBB);
10499
10500     // insert a cmp at the end of BB
10501     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
10502         .addReg(ABSSrcReg)
10503         .addImm(0)
10504         .add(predOps(ARMCC::AL));
10505
10506     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
10507     BuildMI(BB, dl,
10508       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
10509       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
10510
10511     // insert rsbri in RSBBB
10512     // Note: BCC and rsbri will be converted into predicated rsbmi
10513     // by if-conversion pass
10514     BuildMI(*RSBBB, RSBBB->begin(), dl,
10515             TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
10516         .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
10517         .addImm(0)
10518         .add(predOps(ARMCC::AL))
10519         .add(condCodeOp());
10520
10521     // insert PHI in SinkBB,
10522     // reuse ABSDstReg to not change uses of ABS instruction
10523     BuildMI(*SinkBB, SinkBB->begin(), dl,
10524       TII->get(ARM::PHI), ABSDstReg)
10525       .addReg(NewRsbDstReg).addMBB(RSBBB)
10526       .addReg(ABSSrcReg).addMBB(BB);
10527
10528     // remove ABS instruction
10529     MI.eraseFromParent();
10530
10531     // return last added BB
10532     return SinkBB;
10533   }
10534   case ARM::COPY_STRUCT_BYVAL_I32:
10535     ++NumLoopByVals;
10536     return EmitStructByval(MI, BB);
10537   case ARM::WIN__CHKSTK:
10538     return EmitLowered__chkstk(MI, BB);
10539   case ARM::WIN__DBZCHK:
10540     return EmitLowered__dbzchk(MI, BB);
10541   }
10542 }
10543
10544 /// Attaches vregs to MEMCPY that it will use as scratch registers
10545 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
10546 /// instead of as a custom inserter because we need the use list from the SDNode.
10547 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
10548                                     MachineInstr &MI, const SDNode *Node) {
10549   bool isThumb1 = Subtarget->isThumb1Only();
10550
10551   DebugLoc DL = MI.getDebugLoc();
10552   MachineFunction *MF = MI.getParent()->getParent();
10553   MachineRegisterInfo &MRI = MF->getRegInfo();
10554   MachineInstrBuilder MIB(*MF, MI);
10555
10556   // If the new dst/src is unused mark it as dead.
10557   if (!Node->hasAnyUseOfValue(0)) {
10558     MI.getOperand(0).setIsDead(true);
10559   }
10560   if (!Node->hasAnyUseOfValue(1)) {
10561     MI.getOperand(1).setIsDead(true);
10562   }
10563
10564   // The MEMCPY both defines and kills the scratch registers.
10565   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
10566     Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
10567                                                          : &ARM::GPRRegClass);
10568     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
10569   }
10570 }
10571
10572 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
10573                                                       SDNode *Node) const {
        // Post-isel fixup of MI. A MEMCPY is handed to attachMEMCPYScratchRegs;
        // for every other opcode the optional cc_out operand is reconciled with
        // the implicit CPSR def. Node is only forwarded to that helper and
        // consulted by the dead-flag consistency assert further down.
10574   if (MI.getOpcode() == ARM::MEMCPY) {
10575     attachMEMCPYScratchRegs(Subtarget, MI, Node);
10576     return;
10577   }
10578
10579   const MCInstrDesc *MCID = &MI.getDesc();
10580   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
10581   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
10582   // operand is still set to noreg. If needed, set the optional operand's
10583   // register to CPSR, and remove the redundant implicit def.
10584   //
10585   // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
10586
10587   // Rename pseudo opcodes.
        // A zero NewOpc means MI's opcode has no replacement and MI keeps its
        // current descriptor.
10588   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
        // Index of the cc_out operand within MI; assigned on every path below.
10589   unsigned ccOutIdx;
10590   if (NewOpc) {
10591     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
10592     MCID = &TII->get(NewOpc);
10593
          // NOTE(review): the expected operand count is derived from the old
          // descriptor's getSize() - confirm how the pseudos encode that field.
10594     assert(MCID->getNumOperands() ==
10595            MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
10596         && "converted opcode should be the same except for cc_out"
10597            " (and, on Thumb1, pred)");
10598
10599     MI.setDesc(*MCID);
10600
10601     // Add the optional cc_out operand
          // It starts out as noreg (register 0) and is only pointed at CPSR at
          // the very end of this function.
10602     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
10603
10604     // On Thumb1, move all input operands to the end, then add the predicate
10605     if (Subtarget->isThumb1Only()) {
            // Each iteration appends a copy of operand 1 and then removes the
            // original operand 1. This runs MCID->getNumOperands() - 4 times;
            // presumably that is the number of inputs sitting between operand 0
            // and cc_out - confirm against the Thumb1 descriptors.
10606       for (unsigned c = MCID->getNumOperands() - 4; c--;) {
10607         MI.addOperand(MI.getOperand(1));
10608         MI.RemoveOperand(1);
10609       }
10610
10611       // Restore the ties
            // Walk the operands from last to first and re-apply every TIED_TO
            // constraint the new descriptor declares for a register use.
10612       for (unsigned i = MI.getNumOperands(); i--;) {
10613         const MachineOperand& op = MI.getOperand(i);
10614         if (op.isReg() && op.isUse()) {
10615           int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
10616           if (DefIdx != -1)
10617             MI.tieOperands(DefIdx, i);
10618         }
10619       }
10620
            // Append the predicate: condition code AL followed by a noreg
            // register operand.
10621       MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
10622       MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
10623       ccOutIdx = 1;
10624     } else
10625       ccOutIdx = MCID->getNumOperands() - 1;
10626   } else
10627     ccOutIdx = MCID->getNumOperands() - 1;
10628
10629   // Any ARM instruction that sets the 's' bit should specify an optional
10630   // "cc_out" operand in the last operand position.
        // Nothing to adjust when MI has no optional def or the operand at
        // ccOutIdx is not one; a converted opcode must never end up here.
10631   if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
10632     assert(!NewOpc && "Optional cc_out operand required");
10633     return;
10634   }
10635   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
10636   // since we already have an optional CPSR def.
        // The scan starts at MCID->getNumOperands(), so only operands beyond the
        // ones the descriptor declares are inspected. Only the first CPSR def
        // found is removed; its dead flag is remembered in deadCPSR.
10637   bool definesCPSR = false;
10638   bool deadCPSR = false;
10639   for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
10640        ++i) {
10641     const MachineOperand &MO = MI.getOperand(i);
10642     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
10643       definesCPSR = true;
10644       if (MO.isDead())
10645         deadCPSR = true;
10646       MI.RemoveOperand(i);
10647       break;
10648     }
10649   }
        // No implicit CPSR def was found, so cc_out stays as it is.
10650   if (!definesCPSR) {
10651     assert(!NewOpc && "Optional cc_out operand required");
10652     return;
10653   }
        // The removed def must have been dead exactly when value 1 of Node has
        // no uses.
10654   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
10655   if (deadCPSR) {
10656     assert(!MI.getOperand(ccOutIdx).getReg() &&
10657            "expect uninitialized optional cc_out operand");
10658     // Thumb1 instructions must have the S bit even if the CPSR is dead.
          // On every other subtarget a dead CPSR def leaves cc_out as noreg.
10659     if (!Subtarget->isThumb1Only())
10660       return;
10661   }
10662
10663   // If this instruction was defined with an optional CPSR def and its dag node
10664   // had a live implicit CPSR def, then activate the optional CPSR def.
10665   MachineOperand &MO = MI.getOperand(ccOutIdx);
10666   MO.setReg(ARM::CPSR);
10667   MO.setIsDef(true);
10668 }
10669
10670 //===----------------------------------------------------------------------===//
10671 //                           ARM Optimization Hooks
10672 //===----------------------------------------------------------------------===//
10673
10674 // Helper function that checks if N is a null or all ones constant.
10675 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
10676   return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
10677 }
10678
10679 // Return true if N is conditionally 0 or all ones.
10680 // Detects these expressions where cc is an i1 value:
10681 //
10682 //   (select cc 0, y)   [AllOnes=0]
10683 //   (select cc y, 0)   [AllOnes=0]
10684 //   (zext cc)          [AllOnes=0]
10685 //   (sext cc)          [AllOnes=0/1]
10686 //   (select cc -1, y)  [AllOnes=1]
10687 //   (select cc y, -1)  [AllOnes=1]
10688 //
10689 // Invert is set when N is the null/all ones constant when CC is false.
10690 // OtherOp is set to the alternative value of N.
10691 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
10692                                        SDValue &CC, bool &Invert,
10693                                        SDValue &OtherOp,
10694                                        SelectionDAG &DAG) {
10695   switch (N->getOpcode()) {
10696   default: return false;
10697   case ISD::SELECT: {
10698     CC = N->getOperand(0);
10699     SDValue N1 = N->getOperand(1);
10700     SDValue N2 = N->getOperand(2);
10701     if (isZeroOrAllOnes(N1, AllOnes)) {
10702       Invert = false;
10703       OtherOp = N2;
10704       return true;
10705     }
10706     if (isZeroOrAllOnes(N2, AllOnes)) {
10707       Invert = true;
10708       OtherOp = N1;
10709       return true;
10710     }
10711     return false;
10712   }
10713   case ISD::ZERO_EXTEND:
10714     // (zext cc) can never be the all ones value.
10715     if (AllOnes)
10716       return false;
10717     LLVM_FALLTHROUGH;
10718   case ISD::SIGN_EXTEND: {
10719     SDLoc dl(N);
10720     EVT VT = N->getValueType(0);
10721     CC = N->getOperand(0);
10722     if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
10723       return false;
10724     Invert = !AllOnes;
10725     if (AllOnes)
10726       // When looking for an AllOnes constant, N is an sext, and the 'other'
10727       // value is 0.
10728       OtherOp = DAG.getConstant(0, dl, VT);
10729     else if (N->getOpcode() == ISD::ZERO_EXTEND)
10730       // When looking for a 0 constant, N can be zext or sext.
10731       OtherOp = DAG.getConstant(1, dl, VT);
10732     else
10733       OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
10734                                 VT);
10735     return true;
10736   }
10737   }
10738 }
10739
10740 // Combine a constant select operand into its use:
10741 //
10742 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
10743 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
10744 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
10745 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
10746 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
10747 //
10748 // The transform is rejected if the select doesn't have a constant operand that
10749 // is null, or all ones when AllOnes is set.
10750 //
10751 // Also recognize sext/zext from i1:
10752 //
10753 //   (add (zext cc), x) -> (select cc (add x, 1), x)
10754 //   (add (sext cc), x) -> (select cc (add x, -1), x)
10755 //
10756 // These transformations eventually create predicated instructions.
10757 //
10758 // @param N       The node to transform.
10759 // @param Slct    The N operand that is a select.
10760 // @param OtherOp The other N operand (x above).
10761 // @param DCI     Context.
10762 // @param AllOnes Require the select constant to be all ones instead of null.
10763 // @returns The new node, or SDValue() on failure.
10764 static
10765 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
10766                             TargetLowering::DAGCombinerInfo &DCI,
10767                             bool AllOnes = false) {
10768   SelectionDAG &DAG = DCI.DAG;
10769   EVT VT = N->getValueType(0);
10770   SDValue NonConstantVal;
10771   SDValue CCOp;
10772   bool SwapSelectOps;
10773   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
10774                                   NonConstantVal, DAG))
10775     return SDValue();
10776
10777   // Slct is now know to be the desired identity constant when CC is true.
10778   SDValue TrueVal = OtherOp;
10779   SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
10780                                  OtherOp, NonConstantVal);
10781   // Unless SwapSelectOps says CC should be false.
10782   if (SwapSelectOps)
10783     std::swap(TrueVal, FalseVal);
10784
10785   return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
10786                      CCOp, TrueVal, FalseVal);
10787 }
10788
10789 // Attempt combineSelectAndUse on each operand of a commutative operator N.
10790 static
10791 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
10792                                        TargetLowering::DAGCombinerInfo &DCI) {
10793   SDValue N0 = N->getOperand(0);
10794   SDValue N1 = N->getOperand(1);
10795   if (N0.getNode()->hasOneUse())
10796     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
10797       return Result;
10798   if (N1.getNode()->hasOneUse())
10799     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
10800       return Result;
10801   return SDValue();
10802 }
10803
10804 static bool IsVUZPShuffleNode(SDNode *N) {
10805   // VUZP shuffle node.
10806   if (N->getOpcode() == ARMISD::VUZP)
10807     return true;
10808
10809   // "VUZP" on i32 is an alias for VTRN.
10810   if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
10811     return true;
10812
10813   return false;
10814 }
10815
10816 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
10817                                  TargetLowering::DAGCombinerInfo &DCI,
10818                                  const ARMSubtarget *Subtarget) {
10819   // Look for ADD(VUZP.0, VUZP.1).
10820   if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
10821       N0 == N1)
10822    return SDValue();
10823
10824   // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
10825   if (!N->getValueType(0).is64BitVector())
10826     return SDValue();
10827
10828   // Generate vpadd.
10829   SelectionDAG &DAG = DCI.DAG;
10830   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10831   SDLoc dl(N);
10832   SDNode *Unzip = N0.getNode();
10833   EVT VT = N->getValueType(0);
10834
10835   SmallVector<SDValue, 8> Ops;
10836   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
10837                                 TLI.getPointerTy(DAG.getDataLayout())));
10838   Ops.push_back(Unzip->getOperand(0));
10839   Ops.push_back(Unzip->getOperand(1));
10840
10841   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
10842 }
10843
10844 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
10845                                       TargetLowering::DAGCombinerInfo &DCI,
10846                                       const ARMSubtarget *Subtarget) {
10847   // Check for two extended operands.
10848   if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
10849         N1.getOpcode() == ISD::SIGN_EXTEND) &&
10850       !(N0.getOpcode() == ISD::ZERO_EXTEND &&
10851         N1.getOpcode() == ISD::ZERO_EXTEND))
10852     return SDValue();
10853
10854   SDValue N00 = N0.getOperand(0);
10855   SDValue N10 = N1.getOperand(0);
10856
10857   // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
10858   if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
10859       N00 == N10)
10860     return SDValue();
10861
10862   // We only recognize Q register paddl here; this can't be reached until
10863   // after type legalization.
10864   if (!N00.getValueType().is64BitVector() ||
10865       !N0.getValueType().is128BitVector())
10866     return SDValue();
10867
10868   // Generate vpaddl.
10869   SelectionDAG &DAG = DCI.DAG;
10870   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10871   SDLoc dl(N);
10872   EVT VT = N->getValueType(0);
10873
10874   SmallVector<SDValue, 8> Ops;
10875   // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
10876   unsigned Opcode;
10877   if (N0.getOpcode() == ISD::SIGN_EXTEND)
10878     Opcode = Intrinsic::arm_neon_vpaddls;
10879   else
10880     Opcode = Intrinsic::arm_neon_vpaddlu;
10881   Ops.push_back(DAG.getConstant(Opcode, dl,
10882                                 TLI.getPointerTy(DAG.getDataLayout())));
10883   EVT ElemTy = N00.getValueType().getVectorElementType();
10884   unsigned NumElts = VT.getVectorNumElements();
10885   EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
10886   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
10887                                N00.getOperand(0), N00.getOperand(1));
10888   Ops.push_back(Concat);
10889
10890   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
10891 }
10892
10893 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
10894 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
10895 // much easier to match.
10896 static SDValue
10897 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
10898                                TargetLowering::DAGCombinerInfo &DCI,
10899                                const ARMSubtarget *Subtarget) {
10900   // Only perform optimization if after legalize, and if NEON is available. We
10901   // also expected both operands to be BUILD_VECTORs.
10902   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
10903       || N0.getOpcode() != ISD::BUILD_VECTOR
10904       || N1.getOpcode() != ISD::BUILD_VECTOR)
10905     return SDValue();
10906
10907   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
10908   EVT VT = N->getValueType(0);
10909   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
10910     return SDValue();
10911
10912   // Check that the vector operands are of the right form.
10913   // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
10914   // operands, where N is the size of the formed vector.
10915   // Each EXTRACT_VECTOR should have the same input vector and odd or even
10916   // index such that we have a pair wise add pattern.
10917
10918   // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
10919   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10920     return SDValue();
10921   SDValue Vec = N0->getOperand(0)->getOperand(0);
10922   SDNode *V = Vec.getNode();
10923   unsigned nextIndex = 0;
10924
10925   // For each operands to the ADD which are BUILD_VECTORs,
10926   // check to see if each of their operands are an EXTRACT_VECTOR with
10927   // the same vector and appropriate index.
10928   for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
10929     if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
10930         && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
10931
10932       SDValue ExtVec0 = N0->getOperand(i);
10933       SDValue ExtVec1 = N1->getOperand(i);
10934
10935       // First operand is the vector, verify its the same.
10936       if (V != ExtVec0->getOperand(0).getNode() ||
10937           V != ExtVec1->getOperand(0).getNode())
10938         return SDValue();
10939
10940       // Second is the constant, verify its correct.
10941       ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
10942       ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
10943
10944       // For the constant, we want to see all the even or all the odd.
10945       if (!C0 || !C1 || C0->getZExtValue() != nextIndex
10946           || C1->getZExtValue() != nextIndex+1)
10947         return SDValue();
10948
10949       // Increment index.
10950       nextIndex+=2;
10951     } else
10952       return SDValue();
10953   }
10954
10955   // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
10956   // we're using the entire input vector, otherwise there's a size/legality
10957   // mismatch somewhere.
10958   if (nextIndex != Vec.getValueType().getVectorNumElements() ||
10959       Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
10960     return SDValue();
10961
10962   // Create VPADDL node.
10963   SelectionDAG &DAG = DCI.DAG;
10964   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10965
10966   SDLoc dl(N);
10967
10968   // Build operand list.
10969   SmallVector<SDValue, 8> Ops;
10970   Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
10971                                 TLI.getPointerTy(DAG.getDataLayout())));
10972
10973   // Input is the vector.
10974   Ops.push_back(Vec);
10975
10976   // Get widened type and narrowed type.
10977   MVT widenType;
10978   unsigned numElem = VT.getVectorNumElements();
10979
10980   EVT inputLaneType = Vec.getValueType().getVectorElementType();
10981   switch (inputLaneType.getSimpleVT().SimpleTy) {
10982     case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
10983     case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
10984     case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
10985     default:
10986       llvm_unreachable("Invalid vector element type for padd optimization.");
10987   }
10988
10989   SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
10990   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
10991   return DAG.getNode(ExtOp, dl, VT, tmp);
10992 }
10993
10994 static SDValue findMUL_LOHI(SDValue V) {
10995   if (V->getOpcode() == ISD::UMUL_LOHI ||
10996       V->getOpcode() == ISD::SMUL_LOHI)
10997     return V;
10998   return SDValue();
10999 }
11000
11001 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
11002                                         TargetLowering::DAGCombinerInfo &DCI,
11003                                         const ARMSubtarget *Subtarget) {
11004   if (Subtarget->isThumb()) {
11005     if (!Subtarget->hasDSP())
11006       return SDValue();
11007   } else if (!Subtarget->hasV5TEOps())
11008     return SDValue();
11009
11010   // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
11011   // accumulates the product into a 64-bit value. The 16-bit values will
11012   // be sign extended somehow or SRA'd into 32-bit values
11013   // (addc (adde (mul 16bit, 16bit), lo), hi)
11014   SDValue Mul = AddcNode->getOperand(0);
11015   SDValue Lo = AddcNode->getOperand(1);
11016   if (Mul.getOpcode() != ISD::MUL) {
11017     Lo = AddcNode->getOperand(0);
11018     Mul = AddcNode->getOperand(1);
11019     if (Mul.getOpcode() != ISD::MUL)
11020       return SDValue();
11021   }
11022
11023   SDValue SRA = AddeNode->getOperand(0);
11024   SDValue Hi = AddeNode->getOperand(1);
11025   if (SRA.getOpcode() != ISD::SRA) {
11026     SRA = AddeNode->getOperand(1);
11027     Hi = AddeNode->getOperand(0);
11028     if (SRA.getOpcode() != ISD::SRA)
11029       return SDValue();
11030   }
11031   if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
11032     if (Const->getZExtValue() != 31)
11033       return SDValue();
11034   } else
11035     return SDValue();
11036
11037   if (SRA.getOperand(0) != Mul)
11038     return SDValue();
11039
11040   SelectionDAG &DAG = DCI.DAG;
11041   SDLoc dl(AddcNode);
11042   unsigned Opcode = 0;
11043   SDValue Op0;
11044   SDValue Op1;
11045
11046   if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
11047     Opcode = ARMISD::SMLALBB;
11048     Op0 = Mul.getOperand(0);
11049     Op1 = Mul.getOperand(1);
11050   } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
11051     Opcode = ARMISD::SMLALBT;
11052     Op0 = Mul.getOperand(0);
11053     Op1 = Mul.getOperand(1).getOperand(0);
11054   } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
11055     Opcode = ARMISD::SMLALTB;
11056     Op0 = Mul.getOperand(0).getOperand(0);
11057     Op1 = Mul.getOperand(1);
11058   } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
11059     Opcode = ARMISD::SMLALTT;
11060     Op0 = Mul->getOperand(0).getOperand(0);
11061     Op1 = Mul->getOperand(1).getOperand(0);
11062   }
11063
11064   if (!Op0 || !Op1)
11065     return SDValue();
11066
11067   SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
11068                               Op0, Op1, Lo, Hi);
11069   // Replace the ADDs' nodes uses by the MLA node's values.
11070   SDValue HiMLALResult(SMLAL.getNode(), 1);
11071   SDValue LoMLALResult(SMLAL.getNode(), 0);
11072
11073   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
11074   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
11075
11076   // Return original node to notify the driver to stop replacing.
11077   SDValue resNode(AddcNode, 0);
11078   return resNode;
11079 }
11080
11081 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
11082                                      TargetLowering::DAGCombinerInfo &DCI,
11083                                      const ARMSubtarget *Subtarget) {
11084   // Look for multiply add opportunities.
11085   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
11086   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
11087   // a glue link from the first add to the second add.
11088   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
11089   // a S/UMLAL instruction.
11090   //                  UMUL_LOHI
11091   //                 / :lo    \ :hi
11092   //                V          \          [no multiline comment]
11093   //    loAdd ->  ADDC         |
11094   //                 \ :carry /
11095   //                  V      V
11096   //                    ADDE   <- hiAdd
11097   //
11098   // In the special case where only the higher part of a signed result is used
11099   // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
11100   // a constant with the exact value of 0x80000000, we recognize we are dealing
11101   // with a "rounded multiply and add" (or subtract) and transform it into
11102   // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
        //
        // Returns SDValue(AddeSubeNode, 0) after redirecting uses to the new node,
        // an empty SDValue when nothing matched, or the result of
        // AddCombineTo64BitSMLAL16 when the match is delegated to it.
11103
11104   assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
11105           AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
11106          "Expect an ADDE or SUBE");
11107
11108   assert(AddeSubeNode->getNumOperands() == 3 &&
11109          AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
11110          "ADDE node has the wrong inputs");
11111
11112   // Check that we are chained to the right ADDC or SUBC node.
        // An ADDE must take operand 2 from an ADDC, and a SUBE from a SUBC.
11113   SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
11114   if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
11115        AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
11116       (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
11117        AddcSubcNode->getOpcode() != ARMISD::SUBC))
11118     return SDValue();
11119
11120   SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
11121   SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
11122
11123   // Bail out if both ADDC/SUBC operands come from the same node.
11124   if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
11125     return SDValue();
11126
11127   assert(AddcSubcNode->getNumValues() == 2 &&
11128          AddcSubcNode->getValueType(0) == MVT::i32 &&
11129          "Expect ADDC with two result values. First: i32");
11130
11131   // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
11132   // maybe a SMLAL which multiplies two 16-bit values.
        // The 16-bit matcher is only tried for ADDE, and only when neither ADDC
        // operand is a UMUL_LOHI or SMUL_LOHI.
11133   if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
11134       AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
11135       AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
11136       AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
11137       AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
11138     return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
11139
11140   // Check for the triangle shape.
11141   SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
11142   SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
11143
11144   // Make sure that the ADDE/SUBE operands are not coming from the same node.
11145   if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
11146     return SDValue();
11147
11148   // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
        // Operand 0 is preferred; IsLeftOperandMUL records that it was the match.
11149   bool IsLeftOperandMUL = false;
11150   SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
11151   if (MULOp == SDValue())
11152     MULOp = findMUL_LOHI(AddeSubeOp1);
11153   else
11154     IsLeftOperandMUL = true;
11155   if (MULOp == SDValue())
11156     return SDValue();
11157
11158   // Figure out the right opcode.
11159   unsigned Opc = MULOp->getOpcode();
11160   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
11161
11162   // Figure out the high and low input values to the MLAL node.
        // These point at the local SDValue copies declared above, not into the
        // DAG.
11163   SDValue *HiAddSub = nullptr;
11164   SDValue *LoMul = nullptr;
11165   SDValue *LowAddSub = nullptr;
11166
11167   // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
11168   if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
11169     return SDValue();
11170
        // The high addend is the ADDE/SUBE operand that did not supply MULOp.
11171   if (IsLeftOperandMUL)
11172     HiAddSub = &AddeSubeOp1;
11173   else
11174     HiAddSub = &AddeSubeOp0;
11175
11176   // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
11177   // whose low result is fed to the ADDC/SUBC we are checking.
11178
        // At most one of the two tests below can succeed, because the operands
        // were checked above to come from different nodes.
11179   if (AddcSubcOp0 == MULOp.getValue(0)) {
11180     LoMul = &AddcSubcOp0;
11181     LowAddSub = &AddcSubcOp1;
11182   }
11183   if (AddcSubcOp1 == MULOp.getValue(0)) {
11184     LoMul = &AddcSubcOp1;
11185     LowAddSub = &AddcSubcOp0;
11186   }
11187
        // Neither ADDC/SUBC operand is the low result of MULOp.
11188   if (!LoMul)
11189     return SDValue();
11190
11191   // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
11192   // the replacement below will create a cycle.
11193   if (AddcSubcNode == HiAddSub->getNode() ||
11194       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
11195     return SDValue();
11196
11197   // Create the merged node.
11198   SelectionDAG &DAG = DCI.DAG;
11199
11200   // Start building operand list.
        // The first two operands are the two factors of the multiply.
11201   SmallVector<SDValue, 8> Ops;
11202   Ops.push_back(LoMul->getOperand(0));
11203   Ops.push_back(LoMul->getOperand(1));
11204
11205   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
11206   // the case, we must be doing signed multiplication and only use the higher
11207   // part of the result of the MLAL, furthermore the LowAddSub must be a constant
11208   // addition or subtraction with the value of 0x80000000.
        // The code below additionally requires v6, DSP and useMulOps(), and that
        // value 1 of the ADDE/SUBE node has no uses.
11209   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
11210       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
11211       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
11212       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
11213           0x80000000) {
          // The new node takes the two factors plus the high addend and produces
          // a single i32. Only uses of value 0 of the ADDE/SUBE are redirected;
          // the ADDC/SUBC node is not touched in this branch.
11214     Ops.push_back(*HiAddSub);
11215     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
11216       FinalOpc = ARMISD::SMMLSR;
11217     } else {
11218       FinalOpc = ARMISD::SMMLAR;
11219     }
11220     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
11221     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
11222
11223     return SDValue(AddeSubeNode, 0);
11224   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
11225     // SMMLS is generated during instruction selection and the rest of this
11226     // function can not handle the case where AddcSubcNode is a SUBC.
11227     return SDValue();
11228
11229   // Finish building the operand list for {U/S}MLAL
        // Operand order: factor 0, factor 1, low addend, high addend.
11230   Ops.push_back(*LowAddSub);
11231   Ops.push_back(*HiAddSub);
11232
11233   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
11234                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
11235
11236   // Replace the ADDs' nodes uses by the MLA node's values.
        // Value 1 of the new node replaces the ADDE result and value 0 replaces
        // the ADDC result.
11237   SDValue HiMLALResult(MLALNode.getNode(), 1);
11238   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
11239
11240   SDValue LoMLALResult(MLALNode.getNode(), 0);
11241   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
11242
11243   // Return original node to notify the driver to stop replacing.
11244   return SDValue(AddeSubeNode, 0);
11245 }
11246
11247 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
11248                                       TargetLowering::DAGCombinerInfo &DCI,
11249                                       const ARMSubtarget *Subtarget) {
11250   // UMAAL is similar to UMLAL except that it adds two unsigned values.
11251   // While trying to combine for the other MLAL nodes, first search for the
11252   // chance to use UMAAL. Check if Addc uses a node which has already
11253   // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
11254   // as the addend, and it's handled in PerformUMLALCombine.
11255
11256   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
11257     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
11258
11259   // Check that we have a glued ADDC node.
11260   SDNode* AddcNode = AddeNode->getOperand(2).getNode();
11261   if (AddcNode->getOpcode() != ARMISD::ADDC)
11262     return SDValue();
11263
11264   // Find the converted UMAAL or quit if it doesn't exist.
11265   SDNode *UmlalNode = nullptr;
11266   SDValue AddHi;
11267   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
11268     UmlalNode = AddcNode->getOperand(0).getNode();
11269     AddHi = AddcNode->getOperand(1);
11270   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
11271     UmlalNode = AddcNode->getOperand(1).getNode();
11272     AddHi = AddcNode->getOperand(0);
11273   } else {
11274     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
11275   }
11276
11277   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
11278   // the ADDC as well as Zero.
11279   if (!isNullConstant(UmlalNode->getOperand(3)))
11280     return SDValue();
11281
11282   if ((isNullConstant(AddeNode->getOperand(0)) &&
11283        AddeNode->getOperand(1).getNode() == UmlalNode) ||
11284       (AddeNode->getOperand(0).getNode() == UmlalNode &&
11285        isNullConstant(AddeNode->getOperand(1)))) {
11286     SelectionDAG &DAG = DCI.DAG;
11287     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
11288                       UmlalNode->getOperand(2), AddHi };
11289     SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
11290                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
11291
11292     // Replace the ADDs' nodes uses by the UMAAL node's values.
11293     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
11294     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
11295
11296     // Return original node to notify the driver to stop replacing.
11297     return SDValue(AddeNode, 0);
11298   }
11299   return SDValue();
11300 }
11301
11302 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
11303                                    const ARMSubtarget *Subtarget) {
11304   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
11305     return SDValue();
11306
11307   // Check that we have a pair of ADDC and ADDE as operands.
11308   // Both addends of the ADDE must be zero.
11309   SDNode* AddcNode = N->getOperand(2).getNode();
11310   SDNode* AddeNode = N->getOperand(3).getNode();
11311   if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
11312       (AddeNode->getOpcode() == ARMISD::ADDE) &&
11313       isNullConstant(AddeNode->getOperand(0)) &&
11314       isNullConstant(AddeNode->getOperand(1)) &&
11315       (AddeNode->getOperand(2).getNode() == AddcNode))
11316     return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
11317                        DAG.getVTList(MVT::i32, MVT::i32),
11318                        {N->getOperand(0), N->getOperand(1),
11319                         AddcNode->getOperand(0), AddcNode->getOperand(1)});
11320   else
11321     return SDValue();
11322 }
11323
11324 static SDValue PerformAddcSubcCombine(SDNode *N,
11325                                       TargetLowering::DAGCombinerInfo &DCI,
11326                                       const ARMSubtarget *Subtarget) {
11327   SelectionDAG &DAG(DCI.DAG);
11328
11329   if (N->getOpcode() == ARMISD::SUBC) {
11330     // (SUBC (ADDE 0, 0, C), 1) -> C
11331     SDValue LHS = N->getOperand(0);
11332     SDValue RHS = N->getOperand(1);
11333     if (LHS->getOpcode() == ARMISD::ADDE &&
11334         isNullConstant(LHS->getOperand(0)) &&
11335         isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
11336       return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
11337     }
11338   }
11339
11340   if (Subtarget->isThumb1Only()) {
11341     SDValue RHS = N->getOperand(1);
11342     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
11343       int32_t imm = C->getSExtValue();
11344       if (imm < 0 && imm > std::numeric_limits<int>::min()) {
11345         SDLoc DL(N);
11346         RHS = DAG.getConstant(-imm, DL, MVT::i32);
11347         unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
11348                                                            : ARMISD::ADDC;
11349         return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
11350       }
11351     }
11352   }
11353
11354   return SDValue();
11355 }
11356
11357 static SDValue PerformAddeSubeCombine(SDNode *N,
11358                                       TargetLowering::DAGCombinerInfo &DCI,
11359                                       const ARMSubtarget *Subtarget) {
11360   if (Subtarget->isThumb1Only()) {
11361     SelectionDAG &DAG = DCI.DAG;
11362     SDValue RHS = N->getOperand(1);
11363     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
11364       int64_t imm = C->getSExtValue();
11365       if (imm < 0) {
11366         SDLoc DL(N);
11367
11368         // The with-carry-in form matches bitwise not instead of the negation.
11369         // Effectively, the inverse interpretation of the carry flag already
11370         // accounts for part of the negation.
11371         RHS = DAG.getConstant(~imm, DL, MVT::i32);
11372
11373         unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
11374                                                            : ARMISD::ADDE;
11375         return DAG.getNode(Opcode, DL, N->getVTList(),
11376                            N->getOperand(0), RHS, N->getOperand(2));
11377       }
11378     }
11379   } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
11380     return AddCombineTo64bitMLAL(N, DCI, Subtarget);
11381   }
11382   return SDValue();
11383 }
11384
11385 static SDValue PerformABSCombine(SDNode *N,
11386                                   TargetLowering::DAGCombinerInfo &DCI,
11387                                   const ARMSubtarget *Subtarget) {
11388   SDValue res;
11389   SelectionDAG &DAG = DCI.DAG;
11390   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11391
11392   if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0)))
11393     return SDValue();
11394
11395   if (!TLI.expandABS(N, res, DAG))
11396       return SDValue();
11397
11398   return res;
11399 }
11400
11401 /// PerformADDECombine - Target-specific dag combine transform from
11402 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
11403 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
11404 static SDValue PerformADDECombine(SDNode *N,
11405                                   TargetLowering::DAGCombinerInfo &DCI,
11406                                   const ARMSubtarget *Subtarget) {
11407   // Only ARM and Thumb2 support UMLAL/SMLAL.
11408   if (Subtarget->isThumb1Only())
11409     return PerformAddeSubeCombine(N, DCI, Subtarget);
11410
11411   // Only perform the checks after legalize when the pattern is available.
11412   if (DCI.isBeforeLegalize()) return SDValue();
11413
11414   return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
11415 }
11416
11417 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
11418 /// operands N0 and N1.  This is a helper for PerformADDCombine that is
11419 /// called with the default operands, and if that fails, with commuted
11420 /// operands.
11421 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
11422                                           TargetLowering::DAGCombinerInfo &DCI,
11423                                           const ARMSubtarget *Subtarget){
11424   // Attempt to create vpadd for this add.
11425   if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
11426     return Result;
11427
11428   // Attempt to create vpaddl for this add.
11429   if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
11430     return Result;
11431   if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
11432                                                       Subtarget))
11433     return Result;
11434
11435   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
11436   if (N0.getNode()->hasOneUse())
11437     if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
11438       return Result;
11439   return SDValue();
11440 }
11441
11442 bool
11443 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
11444                                                  CombineLevel Level) const {
11445   if (Level == BeforeLegalizeTypes)
11446     return true;
11447
11448   if (N->getOpcode() != ISD::SHL)
11449     return true;
11450
11451   if (Subtarget->isThumb1Only()) {
11452     // Avoid making expensive immediates by commuting shifts. (This logic
11453     // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
11454     // for free.)
11455     if (N->getOpcode() != ISD::SHL)
11456       return true;
11457     SDValue N1 = N->getOperand(0);
11458     if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
11459         N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
11460       return true;
11461     if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
11462       if (Const->getAPIntValue().ult(256))
11463         return false;
11464       if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
11465           Const->getAPIntValue().sgt(-256))
11466         return false;
11467     }
11468     return true;
11469   }
11470
11471   // Turn off commute-with-shift transform after legalization, so it doesn't
11472   // conflict with PerformSHLSimplify.  (We could try to detect when
11473   // PerformSHLSimplify would trigger more precisely, but it isn't
11474   // really necessary.)
11475   return false;
11476 }
11477
11478 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
11479     const SDNode *N, CombineLevel Level) const {
11480   if (!Subtarget->isThumb1Only())
11481     return true;
11482
11483   if (Level == BeforeLegalizeTypes)
11484     return true;
11485
11486   return false;
11487 }
11488
11489 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
11490   if (!Subtarget->hasNEON()) {
11491     if (Subtarget->isThumb1Only())
11492       return VT.getScalarSizeInBits() <= 32;
11493     return true;
11494   }
11495   return VT.isScalarInteger();
11496 }
11497
11498 static SDValue PerformSHLSimplify(SDNode *N,
11499                                 TargetLowering::DAGCombinerInfo &DCI,
11500                                 const ARMSubtarget *ST) {
11501   // Allow the generic combiner to identify potential bswaps.
11502   if (DCI.isBeforeLegalize())
11503     return SDValue();
11504
11505   // DAG combiner will fold:
11506   // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
11507   // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
11508   // Other code patterns that can be also be modified have the following form:
11509   // b + ((a << 1) | 510)
11510   // b + ((a << 1) & 510)
11511   // b + ((a << 1) ^ 510)
11512   // b + ((a << 1) + 510)
11513
11514   // Many instructions can  perform the shift for free, but it requires both
11515   // the operands to be registers. If c1 << c2 is too large, a mov immediate
11516   // instruction will needed. So, unfold back to the original pattern if:
11517   // - if c1 and c2 are small enough that they don't require mov imms.
11518   // - the user(s) of the node can perform an shl
11519
11520   // No shifted operands for 16-bit instructions.
11521   if (ST->isThumb() && ST->isThumb1Only())
11522     return SDValue();
11523
11524   // Check that all the users could perform the shl themselves.
11525   for (auto U : N->uses()) {
11526     switch(U->getOpcode()) {
11527     default:
11528       return SDValue();
11529     case ISD::SUB:
11530     case ISD::ADD:
11531     case ISD::AND:
11532     case ISD::OR:
11533     case ISD::XOR:
11534     case ISD::SETCC:
11535     case ARMISD::CMP:
11536       // Check that the user isn't already using a constant because there
11537       // aren't any instructions that support an immediate operand and a
11538       // shifted operand.
11539       if (isa<ConstantSDNode>(U->getOperand(0)) ||
11540           isa<ConstantSDNode>(U->getOperand(1)))
11541         return SDValue();
11542
11543       // Check that it's not already using a shift.
11544       if (U->getOperand(0).getOpcode() == ISD::SHL ||
11545           U->getOperand(1).getOpcode() == ISD::SHL)
11546         return SDValue();
11547       break;
11548     }
11549   }
11550
11551   if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
11552       N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
11553     return SDValue();
11554
11555   if (N->getOperand(0).getOpcode() != ISD::SHL)
11556     return SDValue();
11557
11558   SDValue SHL = N->getOperand(0);
11559
11560   auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
11561   auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
11562   if (!C1ShlC2 || !C2)
11563     return SDValue();
11564
11565   APInt C2Int = C2->getAPIntValue();
11566   APInt C1Int = C1ShlC2->getAPIntValue();
11567
11568   // Check that performing a lshr will not lose any information.
11569   APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
11570                                      C2Int.getBitWidth() - C2->getZExtValue());
11571   if ((C1Int & Mask) != C1Int)
11572     return SDValue();
11573
11574   // Shift the first constant.
11575   C1Int.lshrInPlace(C2Int);
11576
11577   // The immediates are encoded as an 8-bit value that can be rotated.
11578   auto LargeImm = [](const APInt &Imm) {
11579     unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
11580     return Imm.getBitWidth() - Zeros > 8;
11581   };
11582
11583   if (LargeImm(C1Int) || LargeImm(C2Int))
11584     return SDValue();
11585
11586   SelectionDAG &DAG = DCI.DAG;
11587   SDLoc dl(N);
11588   SDValue X = SHL.getOperand(0);
11589   SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
11590                               DAG.getConstant(C1Int, dl, MVT::i32));
11591   // Shift left to compensate for the lshr of C1Int.
11592   SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
11593
11594   LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
11595              SHL.dump(); N->dump());
11596   LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
11597   return Res;
11598 }
11599
11600
11601 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
11602 ///
11603 static SDValue PerformADDCombine(SDNode *N,
11604                                  TargetLowering::DAGCombinerInfo &DCI,
11605                                  const ARMSubtarget *Subtarget) {
11606   SDValue N0 = N->getOperand(0);
11607   SDValue N1 = N->getOperand(1);
11608
11609   // Only works one way, because it needs an immediate operand.
11610   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
11611     return Result;
11612
11613   // First try with the default operand order.
11614   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
11615     return Result;
11616
11617   // If that didn't work, try again with the operands commuted.
11618   return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
11619 }
11620
11621 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
11622 ///
11623 static SDValue PerformSUBCombine(SDNode *N,
11624                                  TargetLowering::DAGCombinerInfo &DCI) {
11625   SDValue N0 = N->getOperand(0);
11626   SDValue N1 = N->getOperand(1);
11627
11628   // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
11629   if (N1.getNode()->hasOneUse())
11630     if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
11631       return Result;
11632
11633   return SDValue();
11634 }
11635
11636 /// PerformVMULCombine
11637 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
11638 /// special multiplier accumulator forwarding.
11639 ///   vmul d3, d0, d2
11640 ///   vmla d3, d1, d2
11641 /// is faster than
11642 ///   vadd d3, d0, d1
11643 ///   vmul d3, d3, d2
11644 //  However, for (A + B) * (A + B),
11645 //    vadd d2, d0, d1
11646 //    vmul d3, d0, d2
11647 //    vmla d3, d1, d2
11648 //  is slower than
11649 //    vadd d2, d0, d1
11650 //    vmul d3, d2, d2
11651 static SDValue PerformVMULCombine(SDNode *N,
11652                                   TargetLowering::DAGCombinerInfo &DCI,
11653                                   const ARMSubtarget *Subtarget) {
11654   if (!Subtarget->hasVMLxForwarding())
11655     return SDValue();
11656
11657   SelectionDAG &DAG = DCI.DAG;
11658   SDValue N0 = N->getOperand(0);
11659   SDValue N1 = N->getOperand(1);
11660   unsigned Opcode = N0.getOpcode();
11661   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
11662       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
11663     Opcode = N1.getOpcode();
11664     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
11665         Opcode != ISD::FADD && Opcode != ISD::FSUB)
11666       return SDValue();
11667     std::swap(N0, N1);
11668   }
11669
11670   if (N0 == N1)
11671     return SDValue();
11672
11673   EVT VT = N->getValueType(0);
11674   SDLoc DL(N);
11675   SDValue N00 = N0->getOperand(0);
11676   SDValue N01 = N0->getOperand(1);
11677   return DAG.getNode(Opcode, DL, VT,
11678                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
11679                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
11680 }
11681
11682 static SDValue PerformMULCombine(SDNode *N,
11683                                  TargetLowering::DAGCombinerInfo &DCI,
11684                                  const ARMSubtarget *Subtarget) {
11685   SelectionDAG &DAG = DCI.DAG;
11686
11687   if (Subtarget->isThumb1Only())
11688     return SDValue();
11689
11690   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11691     return SDValue();
11692
11693   EVT VT = N->getValueType(0);
11694   if (VT.is64BitVector() || VT.is128BitVector())
11695     return PerformVMULCombine(N, DCI, Subtarget);
11696   if (VT != MVT::i32)
11697     return SDValue();
11698
11699   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11700   if (!C)
11701     return SDValue();
11702
11703   int64_t MulAmt = C->getSExtValue();
11704   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
11705
11706   ShiftAmt = ShiftAmt & (32 - 1);
11707   SDValue V = N->getOperand(0);
11708   SDLoc DL(N);
11709
11710   SDValue Res;
11711   MulAmt >>= ShiftAmt;
11712
11713   if (MulAmt >= 0) {
11714     if (isPowerOf2_32(MulAmt - 1)) {
11715       // (mul x, 2^N + 1) => (add (shl x, N), x)
11716       Res = DAG.getNode(ISD::ADD, DL, VT,
11717                         V,
11718                         DAG.getNode(ISD::SHL, DL, VT,
11719                                     V,
11720                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
11721                                                     MVT::i32)));
11722     } else if (isPowerOf2_32(MulAmt + 1)) {
11723       // (mul x, 2^N - 1) => (sub (shl x, N), x)
11724       Res = DAG.getNode(ISD::SUB, DL, VT,
11725                         DAG.getNode(ISD::SHL, DL, VT,
11726                                     V,
11727                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
11728                                                     MVT::i32)),
11729                         V);
11730     } else
11731       return SDValue();
11732   } else {
11733     uint64_t MulAmtAbs = -MulAmt;
11734     if (isPowerOf2_32(MulAmtAbs + 1)) {
11735       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
11736       Res = DAG.getNode(ISD::SUB, DL, VT,
11737                         V,
11738                         DAG.getNode(ISD::SHL, DL, VT,
11739                                     V,
11740                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
11741                                                     MVT::i32)));
11742     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
11743       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
11744       Res = DAG.getNode(ISD::ADD, DL, VT,
11745                         V,
11746                         DAG.getNode(ISD::SHL, DL, VT,
11747                                     V,
11748                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
11749                                                     MVT::i32)));
11750       Res = DAG.getNode(ISD::SUB, DL, VT,
11751                         DAG.getConstant(0, DL, MVT::i32), Res);
11752     } else
11753       return SDValue();
11754   }
11755
11756   if (ShiftAmt != 0)
11757     Res = DAG.getNode(ISD::SHL, DL, VT,
11758                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
11759
11760   // Do not add new nodes to DAG combiner worklist.
11761   DCI.CombineTo(N, Res, false);
11762   return SDValue();
11763 }
11764
11765 static SDValue CombineANDShift(SDNode *N,
11766                                TargetLowering::DAGCombinerInfo &DCI,
11767                                const ARMSubtarget *Subtarget) {
11768   // Allow DAGCombine to pattern-match before we touch the canonical form.
11769   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
11770     return SDValue();
11771
11772   if (N->getValueType(0) != MVT::i32)
11773     return SDValue();
11774
11775   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11776   if (!N1C)
11777     return SDValue();
11778
11779   uint32_t C1 = (uint32_t)N1C->getZExtValue();
11780   // Don't transform uxtb/uxth.
11781   if (C1 == 255 || C1 == 65535)
11782     return SDValue();
11783
11784   SDNode *N0 = N->getOperand(0).getNode();
11785   if (!N0->hasOneUse())
11786     return SDValue();
11787
11788   if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
11789     return SDValue();
11790
11791   bool LeftShift = N0->getOpcode() == ISD::SHL;
11792
11793   ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
11794   if (!N01C)
11795     return SDValue();
11796
11797   uint32_t C2 = (uint32_t)N01C->getZExtValue();
11798   if (!C2 || C2 >= 32)
11799     return SDValue();
11800
11801   // Clear irrelevant bits in the mask.
11802   if (LeftShift)
11803     C1 &= (-1U << C2);
11804   else
11805     C1 &= (-1U >> C2);
11806
11807   SelectionDAG &DAG = DCI.DAG;
11808   SDLoc DL(N);
11809
11810   // We have a pattern of the form "(and (shl x, c2) c1)" or
11811   // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
11812   // transform to a pair of shifts, to save materializing c1.
11813
11814   // First pattern: right shift, then mask off leading bits.
11815   // FIXME: Use demanded bits?
11816   if (!LeftShift && isMask_32(C1)) {
11817     uint32_t C3 = countLeadingZeros(C1);
11818     if (C2 < C3) {
11819       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
11820                                 DAG.getConstant(C3 - C2, DL, MVT::i32));
11821       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
11822                          DAG.getConstant(C3, DL, MVT::i32));
11823     }
11824   }
11825
11826   // First pattern, reversed: left shift, then mask off trailing bits.
11827   if (LeftShift && isMask_32(~C1)) {
11828     uint32_t C3 = countTrailingZeros(C1);
11829     if (C2 < C3) {
11830       SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
11831                                 DAG.getConstant(C3 - C2, DL, MVT::i32));
11832       return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
11833                          DAG.getConstant(C3, DL, MVT::i32));
11834     }
11835   }
11836
11837   // Second pattern: left shift, then mask off leading bits.
11838   // FIXME: Use demanded bits?
11839   if (LeftShift && isShiftedMask_32(C1)) {
11840     uint32_t Trailing = countTrailingZeros(C1);
11841     uint32_t C3 = countLeadingZeros(C1);
11842     if (Trailing == C2 && C2 + C3 < 32) {
11843       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
11844                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
11845       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
11846                         DAG.getConstant(C3, DL, MVT::i32));
11847     }
11848   }
11849
11850   // Second pattern, reversed: right shift, then mask off trailing bits.
11851   // FIXME: Handle other patterns of known/demanded bits.
11852   if (!LeftShift && isShiftedMask_32(C1)) {
11853     uint32_t Leading = countLeadingZeros(C1);
11854     uint32_t C3 = countTrailingZeros(C1);
11855     if (Leading == C2 && C2 + C3 < 32) {
11856       SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
11857                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
11858       return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
11859                          DAG.getConstant(C3, DL, MVT::i32));
11860     }
11861   }
11862
11863   // FIXME: Transform "(and (shl x, c2) c1)" ->
11864   // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
11865   // c1.
11866   return SDValue();
11867 }
11868
11869 static SDValue PerformANDCombine(SDNode *N,
11870                                  TargetLowering::DAGCombinerInfo &DCI,
11871                                  const ARMSubtarget *Subtarget) {
11872   // Attempt to use immediate-form VBIC
11873   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
11874   SDLoc dl(N);
11875   EVT VT = N->getValueType(0);
11876   SelectionDAG &DAG = DCI.DAG;
11877
11878   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
11879     return SDValue();
11880
11881   APInt SplatBits, SplatUndef;
11882   unsigned SplatBitSize;
11883   bool HasAnyUndefs;
11884   if (BVN && Subtarget->hasNEON() &&
11885       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
11886     if (SplatBitSize <= 64) {
11887       EVT VbicVT;
11888       SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
11889                                       SplatUndef.getZExtValue(), SplatBitSize,
11890                                       DAG, dl, VbicVT, VT.is128BitVector(),
11891                                       OtherModImm);
11892       if (Val.getNode()) {
11893         SDValue Input =
11894           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
11895         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
11896         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
11897       }
11898     }
11899   }
11900
11901   if (!Subtarget->isThumb1Only()) {
11902     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
11903     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
11904       return Result;
11905
11906     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
11907       return Result;
11908   }
11909
11910   if (Subtarget->isThumb1Only())
11911     if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
11912       return Result;
11913
11914   return SDValue();
11915 }
11916
11917 // Try combining OR nodes to SMULWB, SMULWT.
11918 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
11919                                          TargetLowering::DAGCombinerInfo &DCI,
11920                                          const ARMSubtarget *Subtarget) {
11921   if (!Subtarget->hasV6Ops() ||
11922       (Subtarget->isThumb() &&
11923        (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
11924     return SDValue();
11925
11926   SDValue SRL = OR->getOperand(0);
11927   SDValue SHL = OR->getOperand(1);
11928
11929   if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
11930     SRL = OR->getOperand(1);
11931     SHL = OR->getOperand(0);
11932   }
11933   if (!isSRL16(SRL) || !isSHL16(SHL))
11934     return SDValue();
11935
11936   // The first operands to the shifts need to be the two results from the
11937   // same smul_lohi node.
11938   if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
11939        SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
11940     return SDValue();
11941
11942   SDNode *SMULLOHI = SRL.getOperand(0).getNode();
11943   if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
11944       SHL.getOperand(0) != SDValue(SMULLOHI, 1))
11945     return SDValue();
11946
11947   // Now we have:
11948   // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
11949   // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
11950   // For SMUWB the 16-bit value will signed extended somehow.
11951   // For SMULWT only the SRA is required.
11952   // Check both sides of SMUL_LOHI
11953   SDValue OpS16 = SMULLOHI->getOperand(0);
11954   SDValue OpS32 = SMULLOHI->getOperand(1);
11955
11956   SelectionDAG &DAG = DCI.DAG;
11957   if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
11958     OpS16 = OpS32;
11959     OpS32 = SMULLOHI->getOperand(0);
11960   }
11961
11962   SDLoc dl(OR);
11963   unsigned Opcode = 0;
11964   if (isS16(OpS16, DAG))
11965     Opcode = ARMISD::SMULWB;
11966   else if (isSRA16(OpS16)) {
11967     Opcode = ARMISD::SMULWT;
11968     OpS16 = OpS16->getOperand(0);
11969   }
11970   else
11971     return SDValue();
11972
11973   SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
11974   DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
11975   return SDValue(OR, 0);
11976 }
11977
11978 static SDValue PerformORCombineToBFI(SDNode *N,
11979                                      TargetLowering::DAGCombinerInfo &DCI,
11980                                      const ARMSubtarget *Subtarget) {
11981   // BFI is only available on V6T2+
11982   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
11983     return SDValue();
11984
11985   EVT VT = N->getValueType(0);
11986   SDValue N0 = N->getOperand(0);
11987   SDValue N1 = N->getOperand(1);
11988   SelectionDAG &DAG = DCI.DAG;
11989   SDLoc DL(N);
11990   // 1) or (and A, mask), val => ARMbfi A, val, mask
11991   //      iff (val & mask) == val
11992   //
11993   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
11994   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
11995   //          && mask == ~mask2
11996   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
11997   //          && ~mask == mask2
11998   //  (i.e., copy a bitfield value into another bitfield of the same width)
11999
12000   if (VT != MVT::i32)
12001     return SDValue();
12002
12003   SDValue N00 = N0.getOperand(0);
12004
12005   // The value and the mask need to be constants so we can verify this is
12006   // actually a bitfield set. If the mask is 0xffff, we can do better
12007   // via a movt instruction, so don't use BFI in that case.
12008   SDValue MaskOp = N0.getOperand(1);
12009   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
12010   if (!MaskC)
12011     return SDValue();
12012   unsigned Mask = MaskC->getZExtValue();
12013   if (Mask == 0xffff)
12014     return SDValue();
12015   SDValue Res;
12016   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
12017   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
12018   if (N1C) {
12019     unsigned Val = N1C->getZExtValue();
12020     if ((Val & ~Mask) != Val)
12021       return SDValue();
12022
12023     if (ARM::isBitFieldInvertedMask(Mask)) {
12024       Val >>= countTrailingZeros(~Mask);
12025
12026       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
12027                         DAG.getConstant(Val, DL, MVT::i32),
12028                         DAG.getConstant(Mask, DL, MVT::i32));
12029
12030       DCI.CombineTo(N, Res, false);
12031       // Return value from the original node to inform the combiner than N is
12032       // now dead.
12033       return SDValue(N, 0);
12034     }
12035   } else if (N1.getOpcode() == ISD::AND) {
12036     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
12037     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
12038     if (!N11C)
12039       return SDValue();
12040     unsigned Mask2 = N11C->getZExtValue();
12041
12042     // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
12043     // as is to match.
12044     if (ARM::isBitFieldInvertedMask(Mask) &&
12045         (Mask == ~Mask2)) {
12046       // The pack halfword instruction works better for masks that fit it,
12047       // so use that when it's available.
12048       if (Subtarget->hasDSP() &&
12049           (Mask == 0xffff || Mask == 0xffff0000))
12050         return SDValue();
12051       // 2a
12052       unsigned amt = countTrailingZeros(Mask2);
12053       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
12054                         DAG.getConstant(amt, DL, MVT::i32));
12055       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
12056                         DAG.getConstant(Mask, DL, MVT::i32));
12057       DCI.CombineTo(N, Res, false);
12058       // Return value from the original node to inform the combiner than N is
12059       // now dead.
12060       return SDValue(N, 0);
12061     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
12062                (~Mask == Mask2)) {
12063       // The pack halfword instruction works better for masks that fit it,
12064       // so use that when it's available.
12065       if (Subtarget->hasDSP() &&
12066           (Mask2 == 0xffff || Mask2 == 0xffff0000))
12067         return SDValue();
12068       // 2b
12069       unsigned lsb = countTrailingZeros(Mask);
12070       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
12071                         DAG.getConstant(lsb, DL, MVT::i32));
12072       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
12073                         DAG.getConstant(Mask2, DL, MVT::i32));
12074       DCI.CombineTo(N, Res, false);
12075       // Return value from the original node to inform the combiner than N is
12076       // now dead.
12077       return SDValue(N, 0);
12078     }
12079   }
12080
12081   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
12082       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
12083       ARM::isBitFieldInvertedMask(~Mask)) {
12084     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
12085     // where lsb(mask) == #shamt and masked bits of B are known zero.
12086     SDValue ShAmt = N00.getOperand(1);
12087     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
12088     unsigned LSB = countTrailingZeros(Mask);
12089     if (ShAmtC != LSB)
12090       return SDValue();
12091
12092     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
12093                       DAG.getConstant(~Mask, DL, MVT::i32));
12094
12095     DCI.CombineTo(N, Res, false);
12096     // Return value from the original node to inform the combiner than N is
12097     // now dead.
12098     return SDValue(N, 0);
12099   }
12100
12101   return SDValue();
12102 }
12103
12104 static bool isValidMVECond(unsigned CC, bool IsFloat) {
12105   switch (CC) {
12106   case ARMCC::EQ:
12107   case ARMCC::NE:
12108   case ARMCC::LE:
12109   case ARMCC::GT:
12110   case ARMCC::GE:
12111   case ARMCC::LT:
12112     return true;
12113   case ARMCC::HS:
12114   case ARMCC::HI:
12115     return !IsFloat;
12116   default:
12117     return false;
12118   };
12119 }
12120
12121 static SDValue PerformORCombine_i1(SDNode *N,
12122                                    TargetLowering::DAGCombinerInfo &DCI,
12123                                    const ARMSubtarget *Subtarget) {
12124   // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
12125   // together with predicates
12126   EVT VT = N->getValueType(0);
12127   SDValue N0 = N->getOperand(0);
12128   SDValue N1 = N->getOperand(1);
12129
12130   ARMCC::CondCodes CondCode0 = ARMCC::AL;
12131   ARMCC::CondCodes CondCode1 = ARMCC::AL;
12132   if (N0->getOpcode() == ARMISD::VCMP)
12133     CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
12134                     ->getZExtValue();
12135   else if (N0->getOpcode() == ARMISD::VCMPZ)
12136     CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
12137                     ->getZExtValue();
12138   if (N1->getOpcode() == ARMISD::VCMP)
12139     CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
12140                     ->getZExtValue();
12141   else if (N1->getOpcode() == ARMISD::VCMPZ)
12142     CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
12143                     ->getZExtValue();
12144
12145   if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
12146     return SDValue();
12147
12148   unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
12149   unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
12150
12151   if (!isValidMVECond(Opposite0,
12152                       N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
12153       !isValidMVECond(Opposite1,
12154                       N1->getOperand(0)->getValueType(0).isFloatingPoint()))
12155     return SDValue();
12156
12157   SmallVector<SDValue, 4> Ops0;
12158   Ops0.push_back(N0->getOperand(0));
12159   if (N0->getOpcode() == ARMISD::VCMP)
12160     Ops0.push_back(N0->getOperand(1));
12161   Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
12162   SmallVector<SDValue, 4> Ops1;
12163   Ops1.push_back(N1->getOperand(0));
12164   if (N1->getOpcode() == ARMISD::VCMP)
12165     Ops1.push_back(N1->getOperand(1));
12166   Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));
12167
12168   SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
12169   SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
12170   SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
12171   return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
12172                          DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
12173 }
12174
12175 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
12176 static SDValue PerformORCombine(SDNode *N,
12177                                 TargetLowering::DAGCombinerInfo &DCI,
12178                                 const ARMSubtarget *Subtarget) {
12179   // Attempt to use immediate-form VORR
12180   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
12181   SDLoc dl(N);
12182   EVT VT = N->getValueType(0);
12183   SelectionDAG &DAG = DCI.DAG;
12184
12185   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
12186     return SDValue();
12187
12188   APInt SplatBits, SplatUndef;
12189   unsigned SplatBitSize;
12190   bool HasAnyUndefs;
12191   if (BVN && Subtarget->hasNEON() &&
12192       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12193     if (SplatBitSize <= 64) {
12194       EVT VorrVT;
12195       SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
12196                                       SplatUndef.getZExtValue(), SplatBitSize,
12197                                       DAG, dl, VorrVT, VT.is128BitVector(),
12198                                       OtherModImm);
12199       if (Val.getNode()) {
12200         SDValue Input =
12201           DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
12202         SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
12203         return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
12204       }
12205     }
12206   }
12207
12208   if (!Subtarget->isThumb1Only()) {
12209     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12210     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
12211       return Result;
12212     if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
12213       return Result;
12214   }
12215
12216   SDValue N0 = N->getOperand(0);
12217   SDValue N1 = N->getOperand(1);
12218
12219   // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
12220   if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
12221       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
12222
12223     // The code below optimizes (or (and X, Y), Z).
12224     // The AND operand needs to have a single user to make these optimizations
12225     // profitable.
12226     if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
12227       return SDValue();
12228
12229     APInt SplatUndef;
12230     unsigned SplatBitSize;
12231     bool HasAnyUndefs;
12232
12233     APInt SplatBits0, SplatBits1;
12234     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
12235     BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
12236     // Ensure that the second operand of both ands are constants
12237     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
12238                                       HasAnyUndefs) && !HasAnyUndefs) {
12239         if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
12240                                           HasAnyUndefs) && !HasAnyUndefs) {
12241             // Ensure that the bit width of the constants are the same and that
12242             // the splat arguments are logical inverses as per the pattern we
12243             // are trying to simplify.
12244             if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
12245                 SplatBits0 == ~SplatBits1) {
12246                 // Canonicalize the vector type to make instruction selection
12247                 // simpler.
12248                 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
12249                 SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
12250                                              N0->getOperand(1),
12251                                              N0->getOperand(0),
12252                                              N1->getOperand(0));
12253                 return DAG.getNode(ISD::BITCAST, dl, VT, Result);
12254             }
12255         }
12256     }
12257   }
12258
12259   if (Subtarget->hasMVEIntegerOps() &&
12260       (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
12261     return PerformORCombine_i1(N, DCI, Subtarget);
12262
12263   // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
12264   // reasonable.
12265   if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
12266     if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
12267       return Res;
12268   }
12269
12270   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
12271     return Result;
12272
12273   return SDValue();
12274 }
12275
12276 static SDValue PerformXORCombine(SDNode *N,
12277                                  TargetLowering::DAGCombinerInfo &DCI,
12278                                  const ARMSubtarget *Subtarget) {
12279   EVT VT = N->getValueType(0);
12280   SelectionDAG &DAG = DCI.DAG;
12281
12282   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
12283     return SDValue();
12284
12285   if (!Subtarget->isThumb1Only()) {
12286     // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12287     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
12288       return Result;
12289
12290     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
12291       return Result;
12292   }
12293
12294   return SDValue();
12295 }
12296
12297 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
12298 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
12299 // their position in "to" (Rd).
12300 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
12301   assert(N->getOpcode() == ARMISD::BFI);
12302
12303   SDValue From = N->getOperand(1);
12304   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
12305   FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
12306
12307   // If the Base came from a SHR #C, we can deduce that it is really testing bit
12308   // #C in the base of the SHR.
12309   if (From->getOpcode() == ISD::SRL &&
12310       isa<ConstantSDNode>(From->getOperand(1))) {
12311     APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue();
12312     assert(Shift.getLimitedValue() < 32 && "Shift too large!");
12313     FromMask <<= Shift.getLimitedValue(31);
12314     From = From->getOperand(0);
12315   }
12316
12317   return From;
12318 }
12319
12320 // If A and B contain one contiguous set of bits, does A | B == A . B?
12321 //
12322 // Neither A nor B must be zero.
12323 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
12324   unsigned LastActiveBitInA =  A.countTrailingZeros();
12325   unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1;
12326   return LastActiveBitInA - 1 == FirstActiveBitInB;
12327 }
12328
12329 static SDValue FindBFIToCombineWith(SDNode *N) {
12330   // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with,
12331   // if one exists.
12332   APInt ToMask, FromMask;
12333   SDValue From = ParseBFI(N, ToMask, FromMask);
12334   SDValue To = N->getOperand(0);
12335
12336   // Now check for a compatible BFI to merge with. We can pass through BFIs that
12337   // aren't compatible, but not if they set the same bit in their destination as
12338   // we do (or that of any BFI we're going to combine with).
12339   SDValue V = To;
12340   APInt CombinedToMask = ToMask;
12341   while (V.getOpcode() == ARMISD::BFI) {
12342     APInt NewToMask, NewFromMask;
12343     SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
12344     if (NewFrom != From) {
12345       // This BFI has a different base. Keep going.
12346       CombinedToMask |= NewToMask;
12347       V = V.getOperand(0);
12348       continue;
12349     }
12350
12351     // Do the written bits conflict with any we've seen so far?
12352     if ((NewToMask & CombinedToMask).getBoolValue())
12353       // Conflicting bits - bail out because going further is unsafe.
12354       return SDValue();
12355
12356     // Are the new bits contiguous when combined with the old bits?
12357     if (BitsProperlyConcatenate(ToMask, NewToMask) &&
12358         BitsProperlyConcatenate(FromMask, NewFromMask))
12359       return V;
12360     if (BitsProperlyConcatenate(NewToMask, ToMask) &&
12361         BitsProperlyConcatenate(NewFromMask, FromMask))
12362       return V;
12363
12364     // We've seen a write to some bits, so track it.
12365     CombinedToMask |= NewToMask;
12366     // Keep going...
12367     V = V.getOperand(0);
12368   }
12369
12370   return SDValue();
12371 }
12372
12373 static SDValue PerformBFICombine(SDNode *N,
12374                                  TargetLowering::DAGCombinerInfo &DCI) {
12375   SDValue N1 = N->getOperand(1);
12376   if (N1.getOpcode() == ISD::AND) {
12377     // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
12378     // the bits being cleared by the AND are not demanded by the BFI.
12379     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
12380     if (!N11C)
12381       return SDValue();
12382     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
12383     unsigned LSB = countTrailingZeros(~InvMask);
12384     unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
12385     assert(Width <
12386                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
12387            "undefined behavior");
12388     unsigned Mask = (1u << Width) - 1;
12389     unsigned Mask2 = N11C->getZExtValue();
12390     if ((Mask & (~Mask2)) == 0)
12391       return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
12392                              N->getOperand(0), N1.getOperand(0),
12393                              N->getOperand(2));
12394   } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
12395     // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes.
12396     // Keep track of any consecutive bits set that all come from the same base
12397     // value. We can combine these together into a single BFI.
12398     SDValue CombineBFI = FindBFIToCombineWith(N);
12399     if (CombineBFI == SDValue())
12400       return SDValue();
12401
12402     // We've found a BFI.
12403     APInt ToMask1, FromMask1;
12404     SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
12405
12406     APInt ToMask2, FromMask2;
12407     SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
12408     assert(From1 == From2);
12409     (void)From2;
12410
12411     // First, unlink CombineBFI.
12412     DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0));
12413     // Then create a new BFI, combining the two together.
12414     APInt NewFromMask = FromMask1 | FromMask2;
12415     APInt NewToMask = ToMask1 | ToMask2;
12416
12417     EVT VT = N->getValueType(0);
12418     SDLoc dl(N);
12419
12420     if (NewFromMask[0] == 0)
12421       From1 = DCI.DAG.getNode(
12422         ISD::SRL, dl, VT, From1,
12423         DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT));
12424     return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1,
12425                            DCI.DAG.getConstant(~NewToMask, dl, VT));
12426   }
12427   return SDValue();
12428 }
12429
12430 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
12431 /// ARMISD::VMOVRRD.
12432 static SDValue PerformVMOVRRDCombine(SDNode *N,
12433                                      TargetLowering::DAGCombinerInfo &DCI,
12434                                      const ARMSubtarget *Subtarget) {
12435   // vmovrrd(vmovdrr x, y) -> x,y
12436   SDValue InDouble = N->getOperand(0);
12437   if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
12438     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
12439
12440   // vmovrrd(load f64) -> (load i32), (load i32)
12441   SDNode *InNode = InDouble.getNode();
12442   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
12443       InNode->getValueType(0) == MVT::f64 &&
12444       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
12445       !cast<LoadSDNode>(InNode)->isVolatile()) {
12446     // TODO: Should this be done for non-FrameIndex operands?
12447     LoadSDNode *LD = cast<LoadSDNode>(InNode);
12448
12449     SelectionDAG &DAG = DCI.DAG;
12450     SDLoc DL(LD);
12451     SDValue BasePtr = LD->getBasePtr();
12452     SDValue NewLD1 =
12453         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
12454                     LD->getAlignment(), LD->getMemOperand()->getFlags());
12455
12456     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
12457                                     DAG.getConstant(4, DL, MVT::i32));
12458
12459     SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
12460                                  LD->getPointerInfo().getWithOffset(4),
12461                                  std::min(4U, LD->getAlignment()),
12462                                  LD->getMemOperand()->getFlags());
12463
12464     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
12465     if (DCI.DAG.getDataLayout().isBigEndian())
12466       std::swap (NewLD1, NewLD2);
12467     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
12468     return Result;
12469   }
12470
12471   return SDValue();
12472 }
12473
12474 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
12475 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
12476 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
12477   // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
12478   SDValue Op0 = N->getOperand(0);
12479   SDValue Op1 = N->getOperand(1);
12480   if (Op0.getOpcode() == ISD::BITCAST)
12481     Op0 = Op0.getOperand(0);
12482   if (Op1.getOpcode() == ISD::BITCAST)
12483     Op1 = Op1.getOperand(0);
12484   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
12485       Op0.getNode() == Op1.getNode() &&
12486       Op0.getResNo() == 0 && Op1.getResNo() == 1)
12487     return DAG.getNode(ISD::BITCAST, SDLoc(N),
12488                        N->getValueType(0), Op0.getOperand(0));
12489   return SDValue();
12490 }
12491
12492 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
12493 /// are normal, non-volatile loads.  If so, it is profitable to bitcast an
12494 /// i64 vector to have f64 elements, since the value can then be loaded
12495 /// directly into a VFP register.
12496 static bool hasNormalLoadOperand(SDNode *N) {
12497   unsigned NumElts = N->getValueType(0).getVectorNumElements();
12498   for (unsigned i = 0; i < NumElts; ++i) {
12499     SDNode *Elt = N->getOperand(i).getNode();
12500     if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
12501       return true;
12502   }
12503   return false;
12504 }
12505
12506 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
12507 /// ISD::BUILD_VECTOR.
12508 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
12509                                           TargetLowering::DAGCombinerInfo &DCI,
12510                                           const ARMSubtarget *Subtarget) {
12511   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
12512   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
12513   // into a pair of GPRs, which is fine when the value is used as a scalar,
12514   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
12515   SelectionDAG &DAG = DCI.DAG;
12516   if (N->getNumOperands() == 2)
12517     if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
12518       return RV;
12519
12520   // Load i64 elements as f64 values so that type legalization does not split
12521   // them up into i32 values.
12522   EVT VT = N->getValueType(0);
12523   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
12524     return SDValue();
12525   SDLoc dl(N);
12526   SmallVector<SDValue, 8> Ops;
12527   unsigned NumElts = VT.getVectorNumElements();
12528   for (unsigned i = 0; i < NumElts; ++i) {
12529     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
12530     Ops.push_back(V);
12531     // Make the DAGCombiner fold the bitcast.
12532     DCI.AddToWorklist(V.getNode());
12533   }
12534   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
12535   SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
12536   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
12537 }
12538
12539 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
12540 static SDValue
12541 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12542   // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
12543   // At that time, we may have inserted bitcasts from integer to float.
12544   // If these bitcasts have survived DAGCombine, change the lowering of this
12545   // BUILD_VECTOR in something more vector friendly, i.e., that does not
12546   // force to use floating point types.
12547
12548   // Make sure we can change the type of the vector.
12549   // This is possible iff:
12550   // 1. The vector is only used in a bitcast to a integer type. I.e.,
12551   //    1.1. Vector is used only once.
12552   //    1.2. Use is a bit convert to an integer type.
12553   // 2. The size of its operands are 32-bits (64-bits are not legal).
12554   EVT VT = N->getValueType(0);
12555   EVT EltVT = VT.getVectorElementType();
12556
12557   // Check 1.1. and 2.
12558   if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
12559     return SDValue();
12560
12561   // By construction, the input type must be float.
12562   assert(EltVT == MVT::f32 && "Unexpected type!");
12563
12564   // Check 1.2.
12565   SDNode *Use = *N->use_begin();
12566   if (Use->getOpcode() != ISD::BITCAST ||
12567       Use->getValueType(0).isFloatingPoint())
12568     return SDValue();
12569
12570   // Check profitability.
12571   // Model is, if more than half of the relevant operands are bitcast from
12572   // i32, turn the build_vector into a sequence of insert_vector_elt.
12573   // Relevant operands are everything that is not statically
12574   // (i.e., at compile time) bitcasted.
12575   unsigned NumOfBitCastedElts = 0;
12576   unsigned NumElts = VT.getVectorNumElements();
12577   unsigned NumOfRelevantElts = NumElts;
12578   for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
12579     SDValue Elt = N->getOperand(Idx);
12580     if (Elt->getOpcode() == ISD::BITCAST) {
12581       // Assume only bit cast to i32 will go away.
12582       if (Elt->getOperand(0).getValueType() == MVT::i32)
12583         ++NumOfBitCastedElts;
12584     } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
12585       // Constants are statically casted, thus do not count them as
12586       // relevant operands.
12587       --NumOfRelevantElts;
12588   }
12589
12590   // Check if more than half of the elements require a non-free bitcast.
12591   if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
12592     return SDValue();
12593
12594   SelectionDAG &DAG = DCI.DAG;
12595   // Create the new vector type.
12596   EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
12597   // Check if the type is legal.
12598   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12599   if (!TLI.isTypeLegal(VecVT))
12600     return SDValue();
12601
12602   // Combine:
12603   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
12604   // => BITCAST INSERT_VECTOR_ELT
12605   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
12606   //                      (BITCAST EN), N.
12607   SDValue Vec = DAG.getUNDEF(VecVT);
12608   SDLoc dl(N);
12609   for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
12610     SDValue V = N->getOperand(Idx);
12611     if (V.isUndef())
12612       continue;
12613     if (V.getOpcode() == ISD::BITCAST &&
12614         V->getOperand(0).getValueType() == MVT::i32)
12615       // Fold obvious case.
12616       V = V.getOperand(0);
12617     else {
12618       V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
12619       // Make the DAGCombiner fold the bitcasts.
12620       DCI.AddToWorklist(V.getNode());
12621     }
12622     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
12623     Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
12624   }
12625   Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
12626   // Make the DAGCombiner fold the bitcasts.
12627   DCI.AddToWorklist(Vec.getNode());
12628   return Vec;
12629 }
12630
12631 static SDValue
12632 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12633   EVT VT = N->getValueType(0);
12634   SDValue Op = N->getOperand(0);
12635   SDLoc dl(N);
12636
12637   // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
12638   if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
12639     // If the valuetypes are the same, we can remove the cast entirely.
12640     if (Op->getOperand(0).getValueType() == VT)
12641       return Op->getOperand(0);
12642     return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,
12643                            Op->getOperand(0).getValueType(), Op->getOperand(0));
12644   }
12645
12646   return SDValue();
12647 }
12648
12649 /// PerformInsertEltCombine - Target-specific dag combine xforms for
12650 /// ISD::INSERT_VECTOR_ELT.
12651 static SDValue PerformInsertEltCombine(SDNode *N,
12652                                        TargetLowering::DAGCombinerInfo &DCI) {
12653   // Bitcast an i64 load inserted into a vector to f64.
12654   // Otherwise, the i64 value will be legalized to a pair of i32 values.
12655   EVT VT = N->getValueType(0);
12656   SDNode *Elt = N->getOperand(1).getNode();
12657   if (VT.getVectorElementType() != MVT::i64 ||
12658       !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
12659     return SDValue();
12660
12661   SelectionDAG &DAG = DCI.DAG;
12662   SDLoc dl(N);
12663   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
12664                                  VT.getVectorNumElements());
12665   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
12666   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
12667   // Make the DAGCombiner fold the bitcasts.
12668   DCI.AddToWorklist(Vec.getNode());
12669   DCI.AddToWorklist(V.getNode());
12670   SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
12671                                Vec, V, N->getOperand(2));
12672   return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
12673 }
12674
12675 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
12676 /// ISD::VECTOR_SHUFFLE.
12677 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
12678   // The LLVM shufflevector instruction does not require the shuffle mask
12679   // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
12680   // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
12681   // operands do not match the mask length, they are extended by concatenating
12682   // them with undef vectors.  That is probably the right thing for other
12683   // targets, but for NEON it is better to concatenate two double-register
12684   // size vector operands into a single quad-register size vector.  Do that
12685   // transformation here:
12686   //   shuffle(concat(v1, undef), concat(v2, undef)) ->
12687   //   shuffle(concat(v1, v2), undef)
12688   SDValue Op0 = N->getOperand(0);
12689   SDValue Op1 = N->getOperand(1);
12690   if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
12691       Op1.getOpcode() != ISD::CONCAT_VECTORS ||
12692       Op0.getNumOperands() != 2 ||
12693       Op1.getNumOperands() != 2)
12694     return SDValue();
12695   SDValue Concat0Op1 = Op0.getOperand(1);
12696   SDValue Concat1Op1 = Op1.getOperand(1);
12697   if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
12698     return SDValue();
12699   // Skip the transformation if any of the types are illegal.
12700   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12701   EVT VT = N->getValueType(0);
12702   if (!TLI.isTypeLegal(VT) ||
12703       !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
12704       !TLI.isTypeLegal(Concat1Op1.getValueType()))
12705     return SDValue();
12706
12707   SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
12708                                   Op0.getOperand(0), Op1.getOperand(0));
12709   // Translate the shuffle mask.
12710   SmallVector<int, 16> NewMask;
12711   unsigned NumElts = VT.getVectorNumElements();
12712   unsigned HalfElts = NumElts/2;
12713   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
12714   for (unsigned n = 0; n < NumElts; ++n) {
12715     int MaskElt = SVN->getMaskElt(n);
12716     int NewElt = -1;
12717     if (MaskElt < (int)HalfElts)
12718       NewElt = MaskElt;
12719     else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
12720       NewElt = HalfElts + MaskElt - NumElts;
12721     NewMask.push_back(NewElt);
12722   }
12723   return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
12724                               DAG.getUNDEF(VT), NewMask);
12725 }
12726
12727 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
12728 /// NEON load/store intrinsics, and generic vector load/stores, to merge
12729 /// base address updates.
12730 /// For generic load/stores, the memory type is assumed to be a vector.
12731 /// The caller is assumed to have checked legality.
12732 static SDValue CombineBaseUpdate(SDNode *N,
12733                                  TargetLowering::DAGCombinerInfo &DCI) {
12734   SelectionDAG &DAG = DCI.DAG;
12735   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
12736                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
12737   const bool isStore = N->getOpcode() == ISD::STORE;
12738   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
12739   SDValue Addr = N->getOperand(AddrOpIdx);
12740   MemSDNode *MemN = cast<MemSDNode>(N);
12741   SDLoc dl(N);
12742
12743   // Search for a use of the address operand that is an increment.
12744   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
12745          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
12746     SDNode *User = *UI;
12747     if (User->getOpcode() != ISD::ADD ||
12748         UI.getUse().getResNo() != Addr.getResNo())
12749       continue;
12750
12751     // Check that the add is independent of the load/store.  Otherwise, folding
12752     // it would create a cycle. We can avoid searching through Addr as it's a
12753     // predecessor to both.
12754     SmallPtrSet<const SDNode *, 32> Visited;
12755     SmallVector<const SDNode *, 16> Worklist;
12756     Visited.insert(Addr.getNode());
12757     Worklist.push_back(N);
12758     Worklist.push_back(User);
12759     if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
12760         SDNode::hasPredecessorHelper(User, Visited, Worklist))
12761       continue;
12762
12763     // Find the new opcode for the updating load/store.
12764     bool isLoadOp = true;
12765     bool isLaneOp = false;
12766     unsigned NewOpc = 0;
12767     unsigned NumVecs = 0;
12768     if (isIntrinsic) {
12769       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
12770       switch (IntNo) {
12771       default: llvm_unreachable("unexpected intrinsic for Neon base update");
12772       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
12773         NumVecs = 1; break;
12774       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
12775         NumVecs = 2; break;
12776       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
12777         NumVecs = 3; break;
12778       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
12779         NumVecs = 4; break;
12780       case Intrinsic::arm_neon_vld2dup:
12781       case Intrinsic::arm_neon_vld3dup:
12782       case Intrinsic::arm_neon_vld4dup:
12783         // TODO: Support updating VLDxDUP nodes. For now, we just skip
12784         // combining base updates for such intrinsics.
12785         continue;
12786       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
12787         NumVecs = 2; isLaneOp = true; break;
12788       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
12789         NumVecs = 3; isLaneOp = true; break;
12790       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
12791         NumVecs = 4; isLaneOp = true; break;
12792       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
12793         NumVecs = 1; isLoadOp = false; break;
12794       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
12795         NumVecs = 2; isLoadOp = false; break;
12796       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
12797         NumVecs = 3; isLoadOp = false; break;
12798       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
12799         NumVecs = 4; isLoadOp = false; break;
12800       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
12801         NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
12802       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
12803         NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
12804       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
12805         NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
12806       }
12807     } else {
12808       isLaneOp = true;
12809       switch (N->getOpcode()) {
12810       default: llvm_unreachable("unexpected opcode for Neon base update");
12811       case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
12812       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
12813       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
12814       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
12815       case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
12816         NumVecs = 1; isLaneOp = false; break;
12817       case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
12818         NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
12819       }
12820     }
12821
12822     // Find the size of memory referenced by the load/store.
12823     EVT VecTy;
12824     if (isLoadOp) {
12825       VecTy = N->getValueType(0);
12826     } else if (isIntrinsic) {
12827       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
12828     } else {
12829       assert(isStore && "Node has to be a load, a store, or an intrinsic!");
12830       VecTy = N->getOperand(1).getValueType();
12831     }
12832
12833     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
12834     if (isLaneOp)
12835       NumBytes /= VecTy.getVectorNumElements();
12836
12837     // If the increment is a constant, it must match the memory ref size.
12838     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
12839     ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
12840     if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
12841       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
12842       // separate instructions that make it harder to use a non-constant update.
12843       continue;
12844     }
12845
12846     // OK, we found an ADD we can fold into the base update.
12847     // Now, create a _UPD node, taking care of not breaking alignment.
12848
12849     EVT AlignedVecTy = VecTy;
12850     unsigned Alignment = MemN->getAlignment();
12851
12852     // If this is a less-than-standard-aligned load/store, change the type to
12853     // match the standard alignment.
12854     // The alignment is overlooked when selecting _UPD variants; and it's
12855     // easier to introduce bitcasts here than fix that.
12856     // There are 3 ways to get to this base-update combine:
12857     // - intrinsics: they are assumed to be properly aligned (to the standard
12858     //   alignment of the memory type), so we don't need to do anything.
12859     // - ARMISD::VLDx nodes: they are only generated from the aforementioned
12860     //   intrinsics, so, likewise, there's nothing to do.
12861     // - generic load/store instructions: the alignment is specified as an
12862     //   explicit operand, rather than implicitly as the standard alignment
12863     //   of the memory type (like the intrisics).  We need to change the
12864     //   memory type to match the explicit alignment.  That way, we don't
12865     //   generate non-standard-aligned ARMISD::VLDx nodes.
12866     if (isa<LSBaseSDNode>(N)) {
12867       if (Alignment == 0)
12868         Alignment = 1;
12869       if (Alignment < VecTy.getScalarSizeInBits() / 8) {
12870         MVT EltTy = MVT::getIntegerVT(Alignment * 8);
12871         assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
12872         assert(!isLaneOp && "Unexpected generic load/store lane.");
12873         unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
12874         AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
12875       }
12876       // Don't set an explicit alignment on regular load/stores that we want
12877       // to transform to VLD/VST 1_UPD nodes.
12878       // This matches the behavior of regular load/stores, which only get an
12879       // explicit alignment if the MMO alignment is larger than the standard
12880       // alignment of the memory type.
12881       // Intrinsics, however, always get an explicit alignment, set to the
12882       // alignment of the MMO.
12883       Alignment = 1;
12884     }
12885
12886     // Create the new updating load/store node.
12887     // First, create an SDVTList for the new updating node's results.
12888     EVT Tys[6];
12889     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
12890     unsigned n;
12891     for (n = 0; n < NumResultVecs; ++n)
12892       Tys[n] = AlignedVecTy;
12893     Tys[n++] = MVT::i32;
12894     Tys[n] = MVT::Other;
12895     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
12896
12897     // Then, gather the new node's operands.
12898     SmallVector<SDValue, 8> Ops;
12899     Ops.push_back(N->getOperand(0)); // incoming chain
12900     Ops.push_back(N->getOperand(AddrOpIdx));
12901     Ops.push_back(Inc);
12902
12903     if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
12904       // Try to match the intrinsic's signature
12905       Ops.push_back(StN->getValue());
12906     } else {
12907       // Loads (and of course intrinsics) match the intrinsics' signature,
12908       // so just add all but the alignment operand.
12909       for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
12910         Ops.push_back(N->getOperand(i));
12911     }
12912
12913     // For all node types, the alignment operand is always the last one.
12914     Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
12915
12916     // If this is a non-standard-aligned STORE, the penultimate operand is the
12917     // stored value.  Bitcast it to the aligned type.
12918     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
12919       SDValue &StVal = Ops[Ops.size()-2];
12920       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
12921     }
12922
12923     EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
12924     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
12925                                            MemN->getMemOperand());
12926
12927     // Update the uses.
12928     SmallVector<SDValue, 5> NewResults;
12929     for (unsigned i = 0; i < NumResultVecs; ++i)
12930       NewResults.push_back(SDValue(UpdN.getNode(), i));
12931
12932     // If this is an non-standard-aligned LOAD, the first result is the loaded
12933     // value.  Bitcast it to the expected result type.
12934     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
12935       SDValue &LdVal = NewResults[0];
12936       LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
12937     }
12938
12939     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
12940     DCI.CombineTo(N, NewResults);
12941     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
12942
12943     break;
12944   }
12945   return SDValue();
12946 }
12947
12948 static SDValue PerformVLDCombine(SDNode *N,
12949                                  TargetLowering::DAGCombinerInfo &DCI) {
12950   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
12951     return SDValue();
12952
12953   return CombineBaseUpdate(N, DCI);
12954 }
12955
12956 /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
12957 /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
12958 /// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
12959 /// return true.
12960 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12961   SelectionDAG &DAG = DCI.DAG;
12962   EVT VT = N->getValueType(0);
12963   // vldN-dup instructions only support 64-bit vectors for N > 1.
12964   if (!VT.is64BitVector())
12965     return false;
12966
12967   // Check if the VDUPLANE operand is a vldN-dup intrinsic.
12968   SDNode *VLD = N->getOperand(0).getNode();
12969   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
12970     return false;
12971   unsigned NumVecs = 0;
12972   unsigned NewOpc = 0;
12973   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
12974   if (IntNo == Intrinsic::arm_neon_vld2lane) {
12975     NumVecs = 2;
12976     NewOpc = ARMISD::VLD2DUP;
12977   } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
12978     NumVecs = 3;
12979     NewOpc = ARMISD::VLD3DUP;
12980   } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
12981     NumVecs = 4;
12982     NewOpc = ARMISD::VLD4DUP;
12983   } else {
12984     return false;
12985   }
12986
12987   // First check that all the vldN-lane uses are VDUPLANEs and that the lane
12988   // numbers match the load.
12989   unsigned VLDLaneNo =
12990     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
12991   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
12992        UI != UE; ++UI) {
12993     // Ignore uses of the chain result.
12994     if (UI.getUse().getResNo() == NumVecs)
12995       continue;
12996     SDNode *User = *UI;
12997     if (User->getOpcode() != ARMISD::VDUPLANE ||
12998         VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
12999       return false;
13000   }
13001
13002   // Create the vldN-dup node.
13003   EVT Tys[5];
13004   unsigned n;
13005   for (n = 0; n < NumVecs; ++n)
13006     Tys[n] = VT;
13007   Tys[n] = MVT::Other;
13008   SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
13009   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
13010   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
13011   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
13012                                            Ops, VLDMemInt->getMemoryVT(),
13013                                            VLDMemInt->getMemOperand());
13014
13015   // Update the uses.
13016   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
13017        UI != UE; ++UI) {
13018     unsigned ResNo = UI.getUse().getResNo();
13019     // Ignore uses of the chain result.
13020     if (ResNo == NumVecs)
13021       continue;
13022     SDNode *User = *UI;
13023     DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
13024   }
13025
13026   // Now the vldN-lane intrinsic is dead except for its chain result.
13027   // Update uses of the chain.
13028   std::vector<SDValue> VLDDupResults;
13029   for (unsigned n = 0; n < NumVecs; ++n)
13030     VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
13031   VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
13032   DCI.CombineTo(VLD, VLDDupResults);
13033
13034   return true;
13035 }
13036
13037 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
13038 /// ARMISD::VDUPLANE.
13039 static SDValue PerformVDUPLANECombine(SDNode *N,
13040                                       TargetLowering::DAGCombinerInfo &DCI) {
13041   SDValue Op = N->getOperand(0);
13042
13043   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
13044   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
13045   if (CombineVLDDUP(N, DCI))
13046     return SDValue(N, 0);
13047
13048   // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
13049   // redundant.  Ignore bit_converts for now; element sizes are checked below.
13050   while (Op.getOpcode() == ISD::BITCAST)
13051     Op = Op.getOperand(0);
13052   if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
13053     return SDValue();
13054
13055   // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
13056   unsigned EltSize = Op.getScalarValueSizeInBits();
13057   // The canonical VMOV for a zero vector uses a 32-bit element size.
13058   unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
13059   unsigned EltBits;
13060   if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
13061     EltSize = 8;
13062   EVT VT = N->getValueType(0);
13063   if (EltSize > VT.getScalarSizeInBits())
13064     return SDValue();
13065
13066   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
13067 }
13068
13069 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
13070 static SDValue PerformVDUPCombine(SDNode *N,
13071                                   TargetLowering::DAGCombinerInfo &DCI,
13072                                   const ARMSubtarget *Subtarget) {
13073   SelectionDAG &DAG = DCI.DAG;
13074   SDValue Op = N->getOperand(0);
13075
13076   if (!Subtarget->hasNEON())
13077     return SDValue();
13078
13079   // Match VDUP(LOAD) -> VLD1DUP.
13080   // We match this pattern here rather than waiting for isel because the
13081   // transform is only legal for unindexed loads.
13082   LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
13083   if (LD && Op.hasOneUse() && LD->isUnindexed() &&
13084       LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
13085     SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
13086                       DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
13087     SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
13088     SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
13089                                              Ops, LD->getMemoryVT(),
13090                                              LD->getMemOperand());
13091     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
13092     return VLDDup;
13093   }
13094
13095   return SDValue();
13096 }
13097
13098 static SDValue PerformLOADCombine(SDNode *N,
13099                                   TargetLowering::DAGCombinerInfo &DCI) {
13100   EVT VT = N->getValueType(0);
13101
13102   // If this is a legal vector load, try to combine it into a VLD1_UPD.
13103   if (ISD::isNormalLoad(N) && VT.isVector() &&
13104       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13105     return CombineBaseUpdate(N, DCI);
13106
13107   return SDValue();
13108 }
13109
13110 // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
13111 // pack all of the elements in one place.  Next, store to memory in fewer
13112 // chunks.
13113 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
13114                                              SelectionDAG &DAG) {
13115   SDValue StVal = St->getValue();
13116   EVT VT = StVal.getValueType();
13117   if (!St->isTruncatingStore() || !VT.isVector())
13118     return SDValue();
13119   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13120   EVT StVT = St->getMemoryVT();
13121   unsigned NumElems = VT.getVectorNumElements();
13122   assert(StVT != VT && "Cannot truncate to the same type");
13123   unsigned FromEltSz = VT.getScalarSizeInBits();
13124   unsigned ToEltSz = StVT.getScalarSizeInBits();
13125
13126   // From, To sizes and ElemCount must be pow of two
13127   if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
13128     return SDValue();
13129
13130   // We are going to use the original vector elt for storing.
13131   // Accumulated smaller vector elements must be a multiple of the store size.
13132   if (0 != (NumElems * FromEltSz) % ToEltSz)
13133     return SDValue();
13134
13135   unsigned SizeRatio = FromEltSz / ToEltSz;
13136   assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
13137
13138   // Create a type on which we perform the shuffle.
13139   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
13140                                    NumElems * SizeRatio);
13141   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
13142
13143   SDLoc DL(St);
13144   SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
13145   SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
13146   for (unsigned i = 0; i < NumElems; ++i)
13147     ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
13148                                                       : i * SizeRatio;
13149
13150   // Can't shuffle using an illegal type.
13151   if (!TLI.isTypeLegal(WideVecVT))
13152     return SDValue();
13153
13154   SDValue Shuff = DAG.getVectorShuffle(
13155       WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
13156   // At this point all of the data is stored at the bottom of the
13157   // register. We now need to save it to mem.
13158
13159   // Find the largest store unit
13160   MVT StoreType = MVT::i8;
13161   for (MVT Tp : MVT::integer_valuetypes()) {
13162     if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
13163       StoreType = Tp;
13164   }
13165   // Didn't find a legal store type.
13166   if (!TLI.isTypeLegal(StoreType))
13167     return SDValue();
13168
13169   // Bitcast the original vector into a vector of store-size units
13170   EVT StoreVecVT =
13171       EVT::getVectorVT(*DAG.getContext(), StoreType,
13172                        VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
13173   assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
13174   SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
13175   SmallVector<SDValue, 8> Chains;
13176   SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
13177                                       TLI.getPointerTy(DAG.getDataLayout()));
13178   SDValue BasePtr = St->getBasePtr();
13179
13180   // Perform one or more big stores into memory.
13181   unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
13182   for (unsigned I = 0; I < E; I++) {
13183     SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
13184                                  ShuffWide, DAG.getIntPtrConstant(I, DL));
13185     SDValue Ch =
13186         DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
13187                      St->getAlignment(), St->getMemOperand()->getFlags());
13188     BasePtr =
13189         DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
13190     Chains.push_back(Ch);
13191   }
13192   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
13193 }
13194
13195 // Try taking a single vector store from an truncate (which would otherwise turn
13196 // into an expensive buildvector) and splitting it into a series of narrowing
13197 // stores.
13198 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
13199                                                  SelectionDAG &DAG) {
13200   if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
13201     return SDValue();
13202   SDValue Trunc = St->getValue();
13203   if (Trunc->getOpcode() != ISD::TRUNCATE)
13204     return SDValue();
13205   EVT FromVT = Trunc->getOperand(0).getValueType();
13206   EVT ToVT = Trunc.getValueType();
13207   if (!ToVT.isVector())
13208     return SDValue();
13209   assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
13210   EVT ToEltVT = ToVT.getVectorElementType();
13211   EVT FromEltVT = FromVT.getVectorElementType();
13212
13213   unsigned NumElements = 0;
13214   if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
13215     NumElements = 4;
13216   if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
13217     NumElements = 8;
13218   if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
13219       FromVT.getVectorNumElements() % NumElements != 0)
13220     return SDValue();
13221
13222   SDLoc DL(St);
13223   // Details about the old store
13224   SDValue Ch = St->getChain();
13225   SDValue BasePtr = St->getBasePtr();
13226   unsigned Alignment = St->getOriginalAlignment();
13227   MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
13228   AAMDNodes AAInfo = St->getAAInfo();
13229
13230   EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
13231   EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
13232
13233   SmallVector<SDValue, 4> Stores;
13234   for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
13235     unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
13236     SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
13237
13238     SDValue Extract =
13239         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
13240                     DAG.getConstant(i * NumElements, DL, MVT::i32));
13241     SDValue Store = DAG.getTruncStore(
13242         Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
13243         NewToVT, Alignment, MMOFlags, AAInfo);
13244     Stores.push_back(Store);
13245   }
13246   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
13247 }
13248
13249 /// PerformSTORECombine - Target-specific dag combine xforms for
13250 /// ISD::STORE.
13251 static SDValue PerformSTORECombine(SDNode *N,
13252                                    TargetLowering::DAGCombinerInfo &DCI,
13253                                    const ARMSubtarget *Subtarget) {
13254   StoreSDNode *St = cast<StoreSDNode>(N);
13255   if (St->isVolatile())
13256     return SDValue();
13257   SDValue StVal = St->getValue();
13258   EVT VT = StVal.getValueType();
13259
13260   if (Subtarget->hasNEON())
13261     if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
13262       return Store;
13263
13264   if (Subtarget->hasMVEIntegerOps())
13265     if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
13266       return NewToken;
13267
13268   if (!ISD::isNormalStore(St))
13269     return SDValue();
13270
13271   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
13272   // ARM stores of arguments in the same cache line.
13273   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
13274       StVal.getNode()->hasOneUse()) {
13275     SelectionDAG  &DAG = DCI.DAG;
13276     bool isBigEndian = DAG.getDataLayout().isBigEndian();
13277     SDLoc DL(St);
13278     SDValue BasePtr = St->getBasePtr();
13279     SDValue NewST1 = DAG.getStore(
13280         St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
13281         BasePtr, St->getPointerInfo(), St->getAlignment(),
13282         St->getMemOperand()->getFlags());
13283
13284     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
13285                                     DAG.getConstant(4, DL, MVT::i32));
13286     return DAG.getStore(NewST1.getValue(0), DL,
13287                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
13288                         OffsetPtr, St->getPointerInfo(),
13289                         std::min(4U, St->getAlignment() / 2),
13290                         St->getMemOperand()->getFlags());
13291   }
13292
13293   if (StVal.getValueType() == MVT::i64 &&
13294       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13295
13296     // Bitcast an i64 store extracted from a vector to f64.
13297     // Otherwise, the i64 value will be legalized to a pair of i32 values.
13298     SelectionDAG &DAG = DCI.DAG;
13299     SDLoc dl(StVal);
13300     SDValue IntVec = StVal.getOperand(0);
13301     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
13302                                    IntVec.getValueType().getVectorNumElements());
13303     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
13304     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13305                                  Vec, StVal.getOperand(1));
13306     dl = SDLoc(N);
13307     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
13308     // Make the DAGCombiner fold the bitcasts.
13309     DCI.AddToWorklist(Vec.getNode());
13310     DCI.AddToWorklist(ExtElt.getNode());
13311     DCI.AddToWorklist(V.getNode());
13312     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
13313                         St->getPointerInfo(), St->getAlignment(),
13314                         St->getMemOperand()->getFlags(), St->getAAInfo());
13315   }
13316
13317   // If this is a legal vector store, try to combine it into a VST1_UPD.
13318   if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
13319       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13320     return CombineBaseUpdate(N, DCI);
13321
13322   return SDValue();
13323 }
13324
13325 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
13326 /// can replace combinations of VMUL and VCVT (floating-point to integer)
13327 /// when the VMUL has a constant operand that is a power of 2.
13328 ///
13329 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
13330 ///  vmul.f32        d16, d17, d16
13331 ///  vcvt.s32.f32    d16, d16
13332 /// becomes:
13333 ///  vcvt.s32.f32    d16, d16, #3
13334 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
13335                                   const ARMSubtarget *Subtarget) {
13336   if (!Subtarget->hasNEON())
13337     return SDValue();
13338
13339   SDValue Op = N->getOperand(0);
13340   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
13341       Op.getOpcode() != ISD::FMUL)
13342     return SDValue();
13343
13344   SDValue ConstVec = Op->getOperand(1);
13345   if (!isa<BuildVectorSDNode>(ConstVec))
13346     return SDValue();
13347
13348   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
13349   uint32_t FloatBits = FloatTy.getSizeInBits();
13350   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
13351   uint32_t IntBits = IntTy.getSizeInBits();
13352   unsigned NumLanes = Op.getValueType().getVectorNumElements();
13353   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
13354     // These instructions only exist converting from f32 to i32. We can handle
13355     // smaller integers by generating an extra truncate, but larger ones would
13356     // be lossy. We also can't handle anything other than 2 or 4 lanes, since
13357     // these intructions only support v2i32/v4i32 types.
13358     return SDValue();
13359   }
13360
13361   BitVector UndefElements;
13362   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
13363   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
13364   if (C == -1 || C == 0 || C > 32)
13365     return SDValue();
13366
13367   SDLoc dl(N);
13368   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
13369   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
13370     Intrinsic::arm_neon_vcvtfp2fxu;
13371   SDValue FixConv = DAG.getNode(
13372       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
13373       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
13374       DAG.getConstant(C, dl, MVT::i32));
13375
13376   if (IntBits < FloatBits)
13377     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
13378
13379   return FixConv;
13380 }
13381
13382 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
13383 /// can replace combinations of VCVT (integer to floating-point) and VDIV
13384 /// when the VDIV has a constant operand that is a power of 2.
13385 ///
13386 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
13387 ///  vcvt.f32.s32    d16, d16
13388 ///  vdiv.f32        d16, d17, d16
13389 /// becomes:
13390 ///  vcvt.f32.s32    d16, d16, #3
13391 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
13392                                   const ARMSubtarget *Subtarget) {
13393   if (!Subtarget->hasNEON())
13394     return SDValue();
13395
13396   SDValue Op = N->getOperand(0);
13397   unsigned OpOpcode = Op.getNode()->getOpcode();
13398   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
13399       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
13400     return SDValue();
13401
13402   SDValue ConstVec = N->getOperand(1);
13403   if (!isa<BuildVectorSDNode>(ConstVec))
13404     return SDValue();
13405
13406   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
13407   uint32_t FloatBits = FloatTy.getSizeInBits();
13408   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
13409   uint32_t IntBits = IntTy.getSizeInBits();
13410   unsigned NumLanes = Op.getValueType().getVectorNumElements();
13411   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
13412     // These instructions only exist converting from i32 to f32. We can handle
13413     // smaller integers by generating an extra extend, but larger ones would
13414     // be lossy. We also can't handle anything other than 2 or 4 lanes, since
13415     // these intructions only support v2i32/v4i32 types.
13416     return SDValue();
13417   }
13418
13419   BitVector UndefElements;
13420   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
13421   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
13422   if (C == -1 || C == 0 || C > 32)
13423     return SDValue();
13424
13425   SDLoc dl(N);
13426   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
13427   SDValue ConvInput = Op.getOperand(0);
13428   if (IntBits < FloatBits)
13429     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
13430                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
13431                             ConvInput);
13432
13433   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
13434     Intrinsic::arm_neon_vcvtfxu2fp;
13435   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
13436                      Op.getValueType(),
13437                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
13438                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
13439 }
13440
13441 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
13442 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
13443   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
13444   switch (IntNo) {
13445   default:
13446     // Don't do anything for most intrinsics.
13447     break;
13448
13449   // Vector shifts: check for immediate versions and lower them.
13450   // Note: This is done during DAG combining instead of DAG legalizing because
13451   // the build_vectors for 64-bit vector element shift counts are generally
13452   // not legal, and it is hard to see their values after they get legalized to
13453   // loads from a constant pool.
13454   case Intrinsic::arm_neon_vshifts:
13455   case Intrinsic::arm_neon_vshiftu:
13456   case Intrinsic::arm_neon_vrshifts:
13457   case Intrinsic::arm_neon_vrshiftu:
13458   case Intrinsic::arm_neon_vrshiftn:
13459   case Intrinsic::arm_neon_vqshifts:
13460   case Intrinsic::arm_neon_vqshiftu:
13461   case Intrinsic::arm_neon_vqshiftsu:
13462   case Intrinsic::arm_neon_vqshiftns:
13463   case Intrinsic::arm_neon_vqshiftnu:
13464   case Intrinsic::arm_neon_vqshiftnsu:
13465   case Intrinsic::arm_neon_vqrshiftns:
13466   case Intrinsic::arm_neon_vqrshiftnu:
13467   case Intrinsic::arm_neon_vqrshiftnsu: {
13468     EVT VT = N->getOperand(1).getValueType();
13469     int64_t Cnt;
13470     unsigned VShiftOpc = 0;
13471
13472     switch (IntNo) {
13473     case Intrinsic::arm_neon_vshifts:
13474     case Intrinsic::arm_neon_vshiftu:
13475       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
13476         VShiftOpc = ARMISD::VSHLIMM;
13477         break;
13478       }
13479       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
13480         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
13481                                                           : ARMISD::VSHRuIMM);
13482         break;
13483       }
13484       return SDValue();
13485
13486     case Intrinsic::arm_neon_vrshifts:
13487     case Intrinsic::arm_neon_vrshiftu:
13488       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
13489         break;
13490       return SDValue();
13491
13492     case Intrinsic::arm_neon_vqshifts:
13493     case Intrinsic::arm_neon_vqshiftu:
13494       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
13495         break;
13496       return SDValue();
13497
13498     case Intrinsic::arm_neon_vqshiftsu:
13499       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
13500         break;
13501       llvm_unreachable("invalid shift count for vqshlu intrinsic");
13502
13503     case Intrinsic::arm_neon_vrshiftn:
13504     case Intrinsic::arm_neon_vqshiftns:
13505     case Intrinsic::arm_neon_vqshiftnu:
13506     case Intrinsic::arm_neon_vqshiftnsu:
13507     case Intrinsic::arm_neon_vqrshiftns:
13508     case Intrinsic::arm_neon_vqrshiftnu:
13509     case Intrinsic::arm_neon_vqrshiftnsu:
13510       // Narrowing shifts require an immediate right shift.
13511       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
13512         break;
13513       llvm_unreachable("invalid shift count for narrowing vector shift "
13514                        "intrinsic");
13515
13516     default:
13517       llvm_unreachable("unhandled vector shift");
13518     }
13519
13520     switch (IntNo) {
13521     case Intrinsic::arm_neon_vshifts:
13522     case Intrinsic::arm_neon_vshiftu:
13523       // Opcode already set above.
13524       break;
13525     case Intrinsic::arm_neon_vrshifts:
13526       VShiftOpc = ARMISD::VRSHRsIMM;
13527       break;
13528     case Intrinsic::arm_neon_vrshiftu:
13529       VShiftOpc = ARMISD::VRSHRuIMM;
13530       break;
13531     case Intrinsic::arm_neon_vrshiftn:
13532       VShiftOpc = ARMISD::VRSHRNIMM;
13533       break;
13534     case Intrinsic::arm_neon_vqshifts:
13535       VShiftOpc = ARMISD::VQSHLsIMM;
13536       break;
13537     case Intrinsic::arm_neon_vqshiftu:
13538       VShiftOpc = ARMISD::VQSHLuIMM;
13539       break;
13540     case Intrinsic::arm_neon_vqshiftsu:
13541       VShiftOpc = ARMISD::VQSHLsuIMM;
13542       break;
13543     case Intrinsic::arm_neon_vqshiftns:
13544       VShiftOpc = ARMISD::VQSHRNsIMM;
13545       break;
13546     case Intrinsic::arm_neon_vqshiftnu:
13547       VShiftOpc = ARMISD::VQSHRNuIMM;
13548       break;
13549     case Intrinsic::arm_neon_vqshiftnsu:
13550       VShiftOpc = ARMISD::VQSHRNsuIMM;
13551       break;
13552     case Intrinsic::arm_neon_vqrshiftns:
13553       VShiftOpc = ARMISD::VQRSHRNsIMM;
13554       break;
13555     case Intrinsic::arm_neon_vqrshiftnu:
13556       VShiftOpc = ARMISD::VQRSHRNuIMM;
13557       break;
13558     case Intrinsic::arm_neon_vqrshiftnsu:
13559       VShiftOpc = ARMISD::VQRSHRNsuIMM;
13560       break;
13561     }
13562
13563     SDLoc dl(N);
13564     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
13565                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
13566   }
13567
13568   case Intrinsic::arm_neon_vshiftins: {
13569     EVT VT = N->getOperand(1).getValueType();
13570     int64_t Cnt;
13571     unsigned VShiftOpc = 0;
13572
13573     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
13574       VShiftOpc = ARMISD::VSLIIMM;
13575     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
13576       VShiftOpc = ARMISD::VSRIIMM;
13577     else {
13578       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
13579     }
13580
13581     SDLoc dl(N);
13582     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
13583                        N->getOperand(1), N->getOperand(2),
13584                        DAG.getConstant(Cnt, dl, MVT::i32));
13585   }
13586
13587   case Intrinsic::arm_neon_vqrshifts:
13588   case Intrinsic::arm_neon_vqrshiftu:
13589     // No immediate versions of these to check for.
13590     break;
13591   }
13592
13593   return SDValue();
13594 }
13595
13596 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
13597 /// lowers them.  As with the vector shift intrinsics, this is done during DAG
13598 /// combining instead of DAG legalizing because the build_vectors for 64-bit
13599 /// vector element shift counts are generally not legal, and it is hard to see
13600 /// their values after they get legalized to loads from a constant pool.
13601 static SDValue PerformShiftCombine(SDNode *N,
13602                                    TargetLowering::DAGCombinerInfo &DCI,
13603                                    const ARMSubtarget *ST) {
13604   SelectionDAG &DAG = DCI.DAG;
13605   EVT VT = N->getValueType(0);
13606   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
13607     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
13608     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
13609     SDValue N1 = N->getOperand(1);
13610     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
13611       SDValue N0 = N->getOperand(0);
13612       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
13613           DAG.MaskedValueIsZero(N0.getOperand(0),
13614                                 APInt::getHighBitsSet(32, 16)))
13615         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
13616     }
13617   }
13618
13619   if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
13620       N->getOperand(0)->getOpcode() == ISD::AND &&
13621       N->getOperand(0)->hasOneUse()) {
13622     if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
13623       return SDValue();
13624     // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
13625     // usually show up because instcombine prefers to canonicalize it to
13626     // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
13627     // out of GEP lowering in some cases.
13628     SDValue N0 = N->getOperand(0);
13629     ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
13630     if (!ShiftAmtNode)
13631       return SDValue();
13632     uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
13633     ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
13634     if (!AndMaskNode)
13635       return SDValue();
13636     uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
13637     // Don't transform uxtb/uxth.
13638     if (AndMask == 255 || AndMask == 65535)
13639       return SDValue();
13640     if (isMask_32(AndMask)) {
13641       uint32_t MaskedBits = countLeadingZeros(AndMask);
13642       if (MaskedBits > ShiftAmt) {
13643         SDLoc DL(N);
13644         SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
13645                                   DAG.getConstant(MaskedBits, DL, MVT::i32));
13646         return DAG.getNode(
13647             ISD::SRL, DL, MVT::i32, SHL,
13648             DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
13649       }
13650     }
13651   }
13652
13653   // Nothing to be done for scalar shifts.
13654   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13655   if (!VT.isVector() || !TLI.isTypeLegal(VT))
13656     return SDValue();
13657   if (ST->hasMVEIntegerOps() && VT == MVT::v2i64)
13658     return SDValue();
13659
13660   int64_t Cnt;
13661
13662   switch (N->getOpcode()) {
13663   default: llvm_unreachable("unexpected shift opcode");
13664
13665   case ISD::SHL:
13666     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
13667       SDLoc dl(N);
13668       return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
13669                          DAG.getConstant(Cnt, dl, MVT::i32));
13670     }
13671     break;
13672
13673   case ISD::SRA:
13674   case ISD::SRL:
13675     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
13676       unsigned VShiftOpc =
13677           (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
13678       SDLoc dl(N);
13679       return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
13680                          DAG.getConstant(Cnt, dl, MVT::i32));
13681     }
13682   }
13683   return SDValue();
13684 }
13685
13686 // Look for a sign/zero extend of a larger than legal load. This can be split
13687 // into two extending loads, which are simpler to deal with than an arbitrary
13688 // sign extend.
13689 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
13690   SDValue N0 = N->getOperand(0);
13691   if (N0.getOpcode() != ISD::LOAD)
13692     return SDValue();
13693   LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
13694   if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
13695       LD->getExtensionType() != ISD::NON_EXTLOAD)
13696     return SDValue();
13697   EVT FromVT = LD->getValueType(0);
13698   EVT ToVT = N->getValueType(0);
13699   if (!ToVT.isVector())
13700     return SDValue();
13701   assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
13702   EVT ToEltVT = ToVT.getVectorElementType();
13703   EVT FromEltVT = FromVT.getVectorElementType();
13704
13705   unsigned NumElements = 0;
13706   if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
13707     NumElements = 4;
13708   if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
13709     NumElements = 8;
13710   if (NumElements == 0 ||
13711       FromVT.getVectorNumElements() == NumElements ||
13712       FromVT.getVectorNumElements() % NumElements != 0 ||
13713       !isPowerOf2_32(NumElements))
13714     return SDValue();
13715
13716   SDLoc DL(LD);
13717   // Details about the old load
13718   SDValue Ch = LD->getChain();
13719   SDValue BasePtr = LD->getBasePtr();
13720   unsigned Alignment = LD->getOriginalAlignment();
13721   MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
13722   AAMDNodes AAInfo = LD->getAAInfo();
13723
13724   ISD::LoadExtType NewExtType =
13725       N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
13726   SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
13727   EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
13728   EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
13729   unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
13730   SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
13731
13732   // Split the load in half, each side of which is extended separately. This
13733   // is good enough, as legalisation will take it from there. They are either
13734   // already legal or they will be split further into something that is
13735   // legal.
13736   SDValue NewLoad1 =
13737       DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
13738                   LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
13739   SDValue NewLoad2 =
13740       DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
13741                   LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
13742                   Alignment, MMOFlags, AAInfo);
13743
13744   SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
13745                                  SDValue(NewLoad1.getNode(), 1),
13746                                  SDValue(NewLoad2.getNode(), 1));
13747   DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
13748   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
13749 }
13750
13751 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
13752 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
13753 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
13754                                     const ARMSubtarget *ST) {
13755   SDValue N0 = N->getOperand(0);
13756
13757   // Check for sign- and zero-extensions of vector extract operations of 8-
13758   // and 16-bit vector elements.  NEON supports these directly.  They are
13759   // handled during DAG combining because type legalization will promote them
13760   // to 32-bit types and it is messy to recognize the operations after that.
13761   if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
13762     SDValue Vec = N0.getOperand(0);
13763     SDValue Lane = N0.getOperand(1);
13764     EVT VT = N->getValueType(0);
13765     EVT EltVT = N0.getValueType();
13766     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
13767
13768     if (VT == MVT::i32 &&
13769         (EltVT == MVT::i8 || EltVT == MVT::i16) &&
13770         TLI.isTypeLegal(Vec.getValueType()) &&
13771         isa<ConstantSDNode>(Lane)) {
13772
13773       unsigned Opc = 0;
13774       switch (N->getOpcode()) {
13775       default: llvm_unreachable("unexpected opcode");
13776       case ISD::SIGN_EXTEND:
13777         Opc = ARMISD::VGETLANEs;
13778         break;
13779       case ISD::ZERO_EXTEND:
13780       case ISD::ANY_EXTEND:
13781         Opc = ARMISD::VGETLANEu;
13782         break;
13783       }
13784       return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
13785     }
13786   }
13787
13788   if (ST->hasMVEIntegerOps())
13789     if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
13790       return NewLoad;
13791
13792   return SDValue();
13793 }
13794
13795 static const APInt *isPowerOf2Constant(SDValue V) {
13796   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
13797   if (!C)
13798     return nullptr;
13799   const APInt *CV = &C->getAPIntValue();
13800   return CV->isPowerOf2() ? CV : nullptr;
13801 }
13802
13803 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
13804   // If we have a CMOV, OR and AND combination such as:
13805   //   if (x & CN)
13806   //     y |= CM;
13807   //
13808   // And:
13809   //   * CN is a single bit;
13810   //   * All bits covered by CM are known zero in y
13811   //
13812   // Then we can convert this into a sequence of BFI instructions. This will
13813   // always be a win if CM is a single bit, will always be no worse than the
13814   // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
13815   // three bits (due to the extra IT instruction).
13816
13817   SDValue Op0 = CMOV->getOperand(0);
13818   SDValue Op1 = CMOV->getOperand(1);
13819   auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2));
13820   auto CC = CCNode->getAPIntValue().getLimitedValue();
13821   SDValue CmpZ = CMOV->getOperand(4);
13822
13823   // The compare must be against zero.
13824   if (!isNullConstant(CmpZ->getOperand(1)))
13825     return SDValue();
13826
13827   assert(CmpZ->getOpcode() == ARMISD::CMPZ);
13828   SDValue And = CmpZ->getOperand(0);
13829   if (And->getOpcode() != ISD::AND)
13830     return SDValue();
13831   const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
13832   if (!AndC)
13833     return SDValue();
13834   SDValue X = And->getOperand(0);
13835
13836   if (CC == ARMCC::EQ) {
13837     // We're performing an "equal to zero" compare. Swap the operands so we
13838     // canonicalize on a "not equal to zero" compare.
13839     std::swap(Op0, Op1);
13840   } else {
13841     assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
13842   }
13843
13844   if (Op1->getOpcode() != ISD::OR)
13845     return SDValue();
13846
13847   ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
13848   if (!OrC)
13849     return SDValue();
13850   SDValue Y = Op1->getOperand(0);
13851
13852   if (Op0 != Y)
13853     return SDValue();
13854
13855   // Now, is it profitable to continue?
13856   APInt OrCI = OrC->getAPIntValue();
13857   unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
13858   if (OrCI.countPopulation() > Heuristic)
13859     return SDValue();
13860
13861   // Lastly, can we determine that the bits defined by OrCI
13862   // are zero in Y?
13863   KnownBits Known = DAG.computeKnownBits(Y);
13864   if ((OrCI & Known.Zero) != OrCI)
13865     return SDValue();
13866
13867   // OK, we can do the combine.
13868   SDValue V = Y;
13869   SDLoc dl(X);
13870   EVT VT = X.getValueType();
13871   unsigned BitInX = AndC->logBase2();
13872
13873   if (BitInX != 0) {
13874     // We must shift X first.
13875     X = DAG.getNode(ISD::SRL, dl, VT, X,
13876                     DAG.getConstant(BitInX, dl, VT));
13877   }
13878
13879   for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
13880        BitInY < NumActiveBits; ++BitInY) {
13881     if (OrCI[BitInY] == 0)
13882       continue;
13883     APInt Mask(VT.getSizeInBits(), 0);
13884     Mask.setBit(BitInY);
13885     V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
13886                     // Confusingly, the operand is an *inverted* mask.
13887                     DAG.getConstant(~Mask, dl, VT));
13888   }
13889
13890   return V;
13891 }
13892
13893 // Given N, the value controlling the conditional branch, search for the loop
13894 // intrinsic, returning it, along with how the value is used. We need to handle
13895 // patterns such as the following:
13896 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
13897 // (brcond (setcc (loop.decrement), 0, eq), exit)
13898 // (brcond (setcc (loop.decrement), 0, ne), header)
13899 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
13900                                    bool &Negate) {
13901   switch (N->getOpcode()) {
13902   default:
13903     break;
13904   case ISD::XOR: {
13905     if (!isa<ConstantSDNode>(N.getOperand(1)))
13906       return SDValue();
13907     if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
13908       return SDValue();
13909     Negate = !Negate;
13910     return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
13911   }
13912   case ISD::SETCC: {
13913     auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
13914     if (!Const)
13915       return SDValue();
13916     if (Const->isNullValue())
13917       Imm = 0;
13918     else if (Const->isOne())
13919       Imm = 1;
13920     else
13921       return SDValue();
13922     CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
13923     return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
13924   }
13925   case ISD::INTRINSIC_W_CHAIN: {
13926     unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
13927     if (IntOp != Intrinsic::test_set_loop_iterations &&
13928         IntOp != Intrinsic::loop_decrement_reg)
13929       return SDValue();
13930     return N;
13931   }
13932   }
13933   return SDValue();
13934 }
13935
13936 static SDValue PerformHWLoopCombine(SDNode *N,
13937                                     TargetLowering::DAGCombinerInfo &DCI,
13938                                     const ARMSubtarget *ST) {
13939
13940   // The hwloop intrinsics that we're interested are used for control-flow,
13941   // either for entering or exiting the loop:
13942   // - test.set.loop.iterations will test whether its operand is zero. If it
13943   //   is zero, the proceeding branch should not enter the loop.
13944   // - loop.decrement.reg also tests whether its operand is zero. If it is
13945   //   zero, the proceeding branch should not branch back to the beginning of
13946   //   the loop.
13947   // So here, we need to check that how the brcond is using the result of each
13948   // of the intrinsics to ensure that we're branching to the right place at the
13949   // right time.
13950
13951   ISD::CondCode CC;
13952   SDValue Cond;
13953   int Imm = 1;
13954   bool Negate = false;
13955   SDValue Chain = N->getOperand(0);
13956   SDValue Dest;
13957
13958   if (N->getOpcode() == ISD::BRCOND) {
13959     CC = ISD::SETEQ;
13960     Cond = N->getOperand(1);
13961     Dest = N->getOperand(2);
13962   } else {
13963     assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
13964     CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
13965     Cond = N->getOperand(2);
13966     Dest = N->getOperand(4);
13967     if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
13968       if (!Const->isOne() && !Const->isNullValue())
13969         return SDValue();
13970       Imm = Const->getZExtValue();
13971     } else
13972       return SDValue();
13973   }
13974
13975   SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
13976   if (!Int)
13977     return SDValue();
13978
13979   if (Negate)
13980     CC = ISD::getSetCCInverse(CC, true);
13981
13982   auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
13983     return (CC == ISD::SETEQ && Imm == 0) ||
13984            (CC == ISD::SETNE && Imm == 1) ||
13985            (CC == ISD::SETLT && Imm == 1) ||
13986            (CC == ISD::SETULT && Imm == 1);
13987   };
13988
13989   auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
13990     return (CC == ISD::SETEQ && Imm == 1) ||
13991            (CC == ISD::SETNE && Imm == 0) ||
13992            (CC == ISD::SETGT && Imm == 0) ||
13993            (CC == ISD::SETUGT && Imm == 0) ||
13994            (CC == ISD::SETGE && Imm == 1) ||
13995            (CC == ISD::SETUGE && Imm == 1);
13996   };
13997
13998   assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
13999          "unsupported condition");
14000
14001   SDLoc dl(Int);
14002   SelectionDAG &DAG = DCI.DAG;
14003   SDValue Elements = Int.getOperand(2);
14004   unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
14005   assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
14006           && "expected single br user");
14007   SDNode *Br = *N->use_begin();
14008   SDValue OtherTarget = Br->getOperand(1);
14009
14010   // Update the unconditional branch to branch to the given Dest.
14011   auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
14012     SDValue NewBrOps[] = { Br->getOperand(0), Dest };
14013     SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
14014     DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
14015   };
14016
14017   if (IntOp == Intrinsic::test_set_loop_iterations) {
14018     SDValue Res;
14019     // We expect this 'instruction' to branch when the counter is zero.
14020     if (IsTrueIfZero(CC, Imm)) {
14021       SDValue Ops[] = { Chain, Elements, Dest };
14022       Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
14023     } else {
14024       // The logic is the reverse of what we need for WLS, so find the other
14025       // basic block target: the target of the proceeding br.
14026       UpdateUncondBr(Br, Dest, DAG);
14027
14028       SDValue Ops[] = { Chain, Elements, OtherTarget };
14029       Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
14030     }
14031     DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
14032     return Res;
14033   } else {
14034     SDValue Size = DAG.getTargetConstant(
14035       cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
14036     SDValue Args[] = { Int.getOperand(0), Elements, Size, };
14037     SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
14038                                   DAG.getVTList(MVT::i32, MVT::Other), Args);
14039     DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
14040
14041     // We expect this instruction to branch when the count is not zero.
14042     SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
14043
14044     // Update the unconditional branch to target the loop preheader if we've
14045     // found the condition has been reversed.
14046     if (Target == OtherTarget)
14047       UpdateUncondBr(Br, Dest, DAG);
14048
14049     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
14050                         SDValue(LoopDec.getNode(), 1), Chain);
14051
14052     SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
14053     return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
14054   }
14055   return SDValue();
14056 }
14057
14058 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
14059 SDValue
14060 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
14061   SDValue Cmp = N->getOperand(4);
14062   if (Cmp.getOpcode() != ARMISD::CMPZ)
14063     // Only looking at NE cases.
14064     return SDValue();
14065
14066   EVT VT = N->getValueType(0);
14067   SDLoc dl(N);
14068   SDValue LHS = Cmp.getOperand(0);
14069   SDValue RHS = Cmp.getOperand(1);
14070   SDValue Chain = N->getOperand(0);
14071   SDValue BB = N->getOperand(1);
14072   SDValue ARMcc = N->getOperand(2);
14073   ARMCC::CondCodes CC =
14074     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
14075
14076   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
14077   // -> (brcond Chain BB CC CPSR Cmp)
14078   if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
14079       LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
14080       LHS->getOperand(0)->hasOneUse()) {
14081     auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
14082     auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
14083     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14084     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
14085     if ((LHS00C && LHS00C->getZExtValue() == 0) &&
14086         (LHS01C && LHS01C->getZExtValue() == 1) &&
14087         (LHS1C && LHS1C->getZExtValue() == 1) &&
14088         (RHSC && RHSC->getZExtValue() == 0)) {
14089       return DAG.getNode(
14090           ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
14091           LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
14092     }
14093   }
14094
14095   return SDValue();
14096 }
14097
14098 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
14099 SDValue
14100 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
14101   SDValue Cmp = N->getOperand(4);
14102   if (Cmp.getOpcode() != ARMISD::CMPZ)
14103     // Only looking at EQ and NE cases.
14104     return SDValue();
14105
14106   EVT VT = N->getValueType(0);
14107   SDLoc dl(N);
14108   SDValue LHS = Cmp.getOperand(0);
14109   SDValue RHS = Cmp.getOperand(1);
14110   SDValue FalseVal = N->getOperand(0);
14111   SDValue TrueVal = N->getOperand(1);
14112   SDValue ARMcc = N->getOperand(2);
14113   ARMCC::CondCodes CC =
14114     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
14115
14116   // BFI is only available on V6T2+.
14117   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
14118     SDValue R = PerformCMOVToBFICombine(N, DAG);
14119     if (R)
14120       return R;
14121   }
14122
14123   // Simplify
14124   //   mov     r1, r0
14125   //   cmp     r1, x
14126   //   mov     r0, y
14127   //   moveq   r0, x
14128   // to
14129   //   cmp     r0, x
14130   //   movne   r0, y
14131   //
14132   //   mov     r1, r0
14133   //   cmp     r1, x
14134   //   mov     r0, x
14135   //   movne   r0, y
14136   // to
14137   //   cmp     r0, x
14138   //   movne   r0, y
14139   /// FIXME: Turn this into a target neutral optimization?
14140   SDValue Res;
14141   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
14142     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
14143                       N->getOperand(3), Cmp);
14144   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
14145     SDValue ARMcc;
14146     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
14147     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
14148                       N->getOperand(3), NewCmp);
14149   }
14150
14151   // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
14152   // -> (cmov F T CC CPSR Cmp)
14153   if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
14154     auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
14155     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14156     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
14157     if ((LHS0C && LHS0C->getZExtValue() == 0) &&
14158         (LHS1C && LHS1C->getZExtValue() == 1) &&
14159         (RHSC && RHSC->getZExtValue() == 0)) {
14160       return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
14161                          LHS->getOperand(2), LHS->getOperand(3),
14162                          LHS->getOperand(4));
14163     }
14164   }
14165
14166   if (!VT.isInteger())
14167       return SDValue();
14168
14169   // Materialize a boolean comparison for integers so we can avoid branching.
14170   if (isNullConstant(FalseVal)) {
14171     if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
14172       if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
14173         // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
14174         // right 5 bits will make that 32 be 1, otherwise it will be 0.
14175         // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
14176         SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
14177         Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
14178                           DAG.getConstant(5, dl, MVT::i32));
14179       } else {
14180         // CMOV 0, 1, ==, (CMPZ x, y) ->
14181         //     (ADDCARRY (SUB x, y), t:0, t:1)
14182         // where t = (SUBCARRY 0, (SUB x, y), 0)
14183         //
14184         // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
14185         // x != y. In other words, a carry C == 1 when x == y, C == 0
14186         // otherwise.
14187         // The final ADDCARRY computes
14188         //     x - y + (0 - (x - y)) + C == C
14189         SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
14190         SDVTList VTs = DAG.getVTList(VT, MVT::i32);
14191         SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
14192         // ISD::SUBCARRY returns a borrow but we want the carry here
14193         // actually.
14194         SDValue Carry =
14195             DAG.getNode(ISD::SUB, dl, MVT::i32,
14196                         DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
14197         Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
14198       }
14199     } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
14200                (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
14201       // This seems pointless but will allow us to combine it further below.
14202       // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
14203       SDValue Sub =
14204           DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
14205       SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
14206                                           Sub.getValue(1), SDValue());
14207       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
14208                         N->getOperand(3), CPSRGlue.getValue(1));
14209       FalseVal = Sub;
14210     }
14211   } else if (isNullConstant(TrueVal)) {
14212     if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
14213         (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
14214       // This seems pointless but will allow us to combine it further below
14215       // Note that we change == for != as this is the dual for the case above.
14216       // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
14217       SDValue Sub =
14218           DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
14219       SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
14220                                           Sub.getValue(1), SDValue());
14221       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
14222                         DAG.getConstant(ARMCC::NE, dl, MVT::i32),
14223                         N->getOperand(3), CPSRGlue.getValue(1));
14224       FalseVal = Sub;
14225     }
14226   }
14227
14228   // On Thumb1, the DAG above may be further combined if z is a power of 2
14229   // (z == 2 ^ K).
14230   // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
14231   // t1 = (USUBO (SUB x, y), 1)
14232   // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
14233   // Result = if K != 0 then (SHL t2:0, K) else t2:0
14234   //
14235   // This also handles the special case of comparing against zero; it's
14236   // essentially, the same pattern, except there's no SUBS:
14237   // CMOV x, z, !=, (CMPZ x, 0) ->
14238   // t1 = (USUBO x, 1)
14239   // t2 = (SUBCARRY x, t1:0, t1:1)
14240   // Result = if K != 0 then (SHL t2:0, K) else t2:0
14241   const APInt *TrueConst;
14242   if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
14243       ((FalseVal.getOpcode() == ARMISD::SUBS &&
14244         FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
14245        (FalseVal == LHS && isNullConstant(RHS))) &&
14246       (TrueConst = isPowerOf2Constant(TrueVal))) {
14247     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
14248     unsigned ShiftAmount = TrueConst->logBase2();
14249     if (ShiftAmount)
14250       TrueVal = DAG.getConstant(1, dl, VT);
14251     SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
14252     Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
14253
14254     if (ShiftAmount)
14255       Res = DAG.getNode(ISD::SHL, dl, VT, Res,
14256                         DAG.getConstant(ShiftAmount, dl, MVT::i32));
14257   }
14258
14259   if (Res.getNode()) {
14260     KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
14261     // Capture demanded bits information that would be otherwise lost.
14262     if (Known.Zero == 0xfffffffe)
14263       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
14264                         DAG.getValueType(MVT::i1));
14265     else if (Known.Zero == 0xffffff00)
14266       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
14267                         DAG.getValueType(MVT::i8));
14268     else if (Known.Zero == 0xffff0000)
14269       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
14270                         DAG.getValueType(MVT::i16));
14271   }
14272
14273   return Res;
14274 }
14275
14276 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
14277                                              DAGCombinerInfo &DCI) const {
14278   switch (N->getOpcode()) {
14279   default: break;
14280   case ISD::ABS:        return PerformABSCombine(N, DCI, Subtarget);
14281   case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
14282   case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
14283   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
14284   case ISD::SUB:        return PerformSUBCombine(N, DCI);
14285   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
14286   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
14287   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
14288   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
14289   case ISD::BRCOND:
14290   case ISD::BR_CC:      return PerformHWLoopCombine(N, DCI, Subtarget);
14291   case ARMISD::ADDC:
14292   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
14293   case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
14294   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
14295   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
14296   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
14297   case ISD::STORE:      return PerformSTORECombine(N, DCI, Subtarget);
14298   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
14299   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
14300   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
14301   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
14302   case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
14303   case ISD::FP_TO_SINT:
14304   case ISD::FP_TO_UINT:
14305     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
14306   case ISD::FDIV:
14307     return PerformVDIVCombine(N, DCI.DAG, Subtarget);
14308   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
14309   case ISD::SHL:
14310   case ISD::SRA:
14311   case ISD::SRL:
14312     return PerformShiftCombine(N, DCI, Subtarget);
14313   case ISD::SIGN_EXTEND:
14314   case ISD::ZERO_EXTEND:
14315   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
14316   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
14317   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
14318   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
14319   case ARMISD::VLD1DUP:
14320   case ARMISD::VLD2DUP:
14321   case ARMISD::VLD3DUP:
14322   case ARMISD::VLD4DUP:
14323     return PerformVLDCombine(N, DCI);
14324   case ARMISD::BUILD_VECTOR:
14325     return PerformARMBUILD_VECTORCombine(N, DCI);
14326   case ARMISD::PREDICATE_CAST:
14327     return PerformPREDICATE_CASTCombine(N, DCI);
14328   case ARMISD::SMULWB: {
14329     unsigned BitWidth = N->getValueType(0).getSizeInBits();
14330     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
14331     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
14332       return SDValue();
14333     break;
14334   }
14335   case ARMISD::SMULWT: {
14336     unsigned BitWidth = N->getValueType(0).getSizeInBits();
14337     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
14338     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
14339       return SDValue();
14340     break;
14341   }
14342   case ARMISD::SMLALBB: {
14343     unsigned BitWidth = N->getValueType(0).getSizeInBits();
14344     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
14345     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
14346         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
14347       return SDValue();
14348     break;
14349   }
14350   case ARMISD::SMLALBT: {
14351     unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
14352     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
14353     unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
14354     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
14355     if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
14356         (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
14357       return SDValue();
14358     break;
14359   }
14360   case ARMISD::SMLALTB: {
14361     unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
14362     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
14363     unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
14364     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
14365     if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
14366         (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
14367       return SDValue();
14368     break;
14369   }
14370   case ARMISD::SMLALTT: {
14371     unsigned BitWidth = N->getValueType(0).getSizeInBits();
14372     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
14373     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
14374         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
14375       return SDValue();
14376     break;
14377   }
14378   case ISD::INTRINSIC_VOID:
14379   case ISD::INTRINSIC_W_CHAIN:
14380     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
14381     case Intrinsic::arm_neon_vld1:
14382     case Intrinsic::arm_neon_vld1x2:
14383     case Intrinsic::arm_neon_vld1x3:
14384     case Intrinsic::arm_neon_vld1x4:
14385     case Intrinsic::arm_neon_vld2:
14386     case Intrinsic::arm_neon_vld3:
14387     case Intrinsic::arm_neon_vld4:
14388     case Intrinsic::arm_neon_vld2lane:
14389     case Intrinsic::arm_neon_vld3lane:
14390     case Intrinsic::arm_neon_vld4lane:
14391     case Intrinsic::arm_neon_vld2dup:
14392     case Intrinsic::arm_neon_vld3dup:
14393     case Intrinsic::arm_neon_vld4dup:
14394     case Intrinsic::arm_neon_vst1:
14395     case Intrinsic::arm_neon_vst1x2:
14396     case Intrinsic::arm_neon_vst1x3:
14397     case Intrinsic::arm_neon_vst1x4:
14398     case Intrinsic::arm_neon_vst2:
14399     case Intrinsic::arm_neon_vst3:
14400     case Intrinsic::arm_neon_vst4:
14401     case Intrinsic::arm_neon_vst2lane:
14402     case Intrinsic::arm_neon_vst3lane:
14403     case Intrinsic::arm_neon_vst4lane:
14404       return PerformVLDCombine(N, DCI);
14405     default: break;
14406     }
14407     break;
14408   }
14409   return SDValue();
14410 }
14411
14412 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
14413                                                           EVT VT) const {
14414   return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
14415 }
14416
14417 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
14418                                                        unsigned Alignment,
14419                                                        MachineMemOperand::Flags,
14420                                                        bool *Fast) const {
14421   // Depends what it gets converted into if the type is weird.
14422   if (!VT.isSimple())
14423     return false;
14424
14425   // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
14426   bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
14427   auto Ty = VT.getSimpleVT().SimpleTy;
14428
14429   if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
14430     // Unaligned access can use (for example) LRDB, LRDH, LDR
14431     if (AllowsUnaligned) {
14432       if (Fast)
14433         *Fast = Subtarget->hasV7Ops();
14434       return true;
14435     }
14436   }
14437
14438   if (Ty == MVT::f64 || Ty == MVT::v2f64) {
14439     // For any little-endian targets with neon, we can support unaligned ld/st
14440     // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
14441     // A big-endian target may also explicitly support unaligned accesses
14442     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
14443       if (Fast)
14444         *Fast = true;
14445       return true;
14446     }
14447   }
14448
14449   if (!Subtarget->hasMVEIntegerOps())
14450     return false;
14451
14452   // These are for predicates
14453   if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) {
14454     if (Fast)
14455       *Fast = true;
14456     return true;
14457   }
14458
14459   // These are for truncated stores/narrowing loads. They are fine so long as
14460   // the alignment is at least the size of the item being loaded
14461   if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
14462       Alignment >= VT.getScalarSizeInBits() / 8) {
14463     if (Fast)
14464       *Fast = true;
14465     return true;
14466   }
14467
14468   // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
14469   // VSTRW.U32 all store the vector register in exactly the same format, and
14470   // differ only in the range of their immediate offset field and the required
14471   // alignment. So there is always a store that can be used, regardless of
14472   // actual type.
14473   //
14474   // For big endian, that is not the case. But can still emit a (VSTRB.U8;
14475   // VREV64.8) pair and get the same effect. This will likely be better than
14476   // aligning the vector through the stack.
14477   if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
14478       Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
14479       Ty == MVT::v2f64) {
14480     if (Fast)
14481       *Fast = true;
14482     return true;
14483   }
14484
14485   return false;
14486 }
14487
/// Returns true when both \p SrcAlign and \p DstAlign are acceptable for
/// \p AlignCheck: an alignment of zero is always accepted, any other value
/// must be a multiple of \p AlignCheck.
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  auto Satisfies = [AlignCheck](unsigned Align) {
    return Align == 0 || Align % AlignCheck == 0;
  };
  // The source is examined first, then the destination.
  return Satisfies(SrcAlign) && Satisfies(DstAlign);
}
14493
14494 EVT ARMTargetLowering::getOptimalMemOpType(
14495     uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
14496     bool ZeroMemset, bool MemcpyStrSrc,
14497     const AttributeList &FuncAttributes) const {
14498   // See if we can use NEON instructions for this...
14499   if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
14500       !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
14501     bool Fast;
14502     if (Size >= 16 &&
14503         (memOpAlign(SrcAlign, DstAlign, 16) ||
14504          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
14505                                          MachineMemOperand::MONone, &Fast) &&
14506           Fast))) {
14507       return MVT::v2f64;
14508     } else if (Size >= 8 &&
14509                (memOpAlign(SrcAlign, DstAlign, 8) ||
14510                 (allowsMisalignedMemoryAccesses(
14511                      MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
14512                  Fast))) {
14513       return MVT::f64;
14514     }
14515   }
14516
14517   // Let the target-independent logic figure it out.
14518   return MVT::Other;
14519 }
14520
14521 // 64-bit integers are split into their high and low parts and held in two
14522 // different registers, so the trunc is free since the low register can just
14523 // be used.
14524 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
14525   if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
14526     return false;
14527   unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
14528   unsigned DestBits = DstTy->getPrimitiveSizeInBits();
14529   return (SrcBits == 64 && DestBits == 32);
14530 }
14531
14532 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
14533   if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
14534       !DstVT.isInteger())
14535     return false;
14536   unsigned SrcBits = SrcVT.getSizeInBits();
14537   unsigned DestBits = DstVT.getSizeInBits();
14538   return (SrcBits == 64 && DestBits == 32);
14539 }
14540
14541 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
14542   if (Val.getOpcode() != ISD::LOAD)
14543     return false;
14544
14545   EVT VT1 = Val.getValueType();
14546   if (!VT1.isSimple() || !VT1.isInteger() ||
14547       !VT2.isSimple() || !VT2.isInteger())
14548     return false;
14549
14550   switch (VT1.getSimpleVT().SimpleTy) {
14551   default: break;
14552   case MVT::i1:
14553   case MVT::i8:
14554   case MVT::i16:
14555     // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
14556     return true;
14557   }
14558
14559   return false;
14560 }
14561
14562 bool ARMTargetLowering::isFNegFree(EVT VT) const {
14563   if (!VT.isSimple())
14564     return false;
14565
14566   // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
14567   // negate values directly (fneg is free). So, we don't want to let the DAG
14568   // combiner rewrite fneg into xors and some other instructions.  For f16 and
14569   // FullFP16 argument passing, some bitcast nodes may be introduced,
14570   // triggering this DAG combine rewrite, so we are avoiding that with this.
14571   switch (VT.getSimpleVT().SimpleTy) {
14572   default: break;
14573   case MVT::f16:
14574     return Subtarget->hasFullFP16();
14575   }
14576
14577   return false;
14578 }
14579
14580 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
14581 /// of the vector elements.
14582 static bool areExtractExts(Value *Ext1, Value *Ext2) {
14583   auto areExtDoubled = [](Instruction *Ext) {
14584     return Ext->getType()->getScalarSizeInBits() ==
14585            2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
14586   };
14587
14588   if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
14589       !match(Ext2, m_ZExtOrSExt(m_Value())) ||
14590       !areExtDoubled(cast<Instruction>(Ext1)) ||
14591       !areExtDoubled(cast<Instruction>(Ext2)))
14592     return false;
14593
14594   return true;
14595 }
14596
14597 /// Check if sinking \p I's operands to I's basic block is profitable, because
14598 /// the operands can be folded into a target instruction, e.g.
14599 /// sext/zext can be folded into vsubl.
14600 bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
14601                                            SmallVectorImpl<Use *> &Ops) const {
14602   if (!I->getType()->isVectorTy())
14603     return false;
14604
14605   if (Subtarget->hasNEON()) {
14606     switch (I->getOpcode()) {
14607     case Instruction::Sub:
14608     case Instruction::Add: {
14609       if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
14610         return false;
14611       Ops.push_back(&I->getOperandUse(0));
14612       Ops.push_back(&I->getOperandUse(1));
14613       return true;
14614     }
14615     default:
14616       return false;
14617     }
14618   }
14619
14620   if (!Subtarget->hasMVEIntegerOps())
14621     return false;
14622
14623   auto IsSinker = [](Instruction *I, int Operand) {
14624     switch (I->getOpcode()) {
14625     case Instruction::Add:
14626     case Instruction::Mul:
14627       return true;
14628     case Instruction::Sub:
14629       return Operand == 1;
14630     default:
14631       return false;
14632     }
14633   };
14634
14635   int Op = 0;
14636   if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
14637     Op = 1;
14638   if (!IsSinker(I, Op))
14639     return false;
14640   if (!match(I->getOperand(Op),
14641              m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
14642                              m_Undef(), m_Zero()))) {
14643     return false;
14644   }
14645   Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
14646   // All uses of the shuffle should be sunk to avoid duplicating it across gpr
14647   // and vector registers
14648   for (Use &U : Shuffle->uses()) {
14649     Instruction *Insn = cast<Instruction>(U.getUser());
14650     if (!IsSinker(Insn, U.getOperandNo()))
14651       return false;
14652   }
14653   Ops.push_back(&Shuffle->getOperandUse(0));
14654   Ops.push_back(&I->getOperandUse(Op));
14655   return true;
14656 }
14657
14658 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
14659   EVT VT = ExtVal.getValueType();
14660
14661   if (!isTypeLegal(VT))
14662     return false;
14663
14664   // Don't create a loadext if we can fold the extension into a wide/long
14665   // instruction.
14666   // If there's more than one user instruction, the loadext is desirable no
14667   // matter what.  There can be two uses by the same instruction.
14668   if (ExtVal->use_empty() ||
14669       !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
14670     return true;
14671
14672   SDNode *U = *ExtVal->use_begin();
14673   if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
14674        U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
14675     return false;
14676
14677   return true;
14678 }
14679
14680 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
14681   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
14682     return false;
14683
14684   if (!isTypeLegal(EVT::getEVT(Ty1)))
14685     return false;
14686
14687   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
14688
14689   // Assuming the caller doesn't have a zeroext or signext return parameter,
14690   // truncation all the way down to i1 is valid.
14691   return true;
14692 }
14693
14694 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
14695                                                 const AddrMode &AM, Type *Ty,
14696                                                 unsigned AS) const {
14697   if (isLegalAddressingMode(DL, AM, Ty, AS)) {
14698     if (Subtarget->hasFPAO())
14699       return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
14700     return 0;
14701   }
14702   return -1;
14703 }
14704
14705 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
14706   if (V < 0)
14707     return false;
14708
14709   unsigned Scale = 1;
14710   switch (VT.getSimpleVT().SimpleTy) {
14711   case MVT::i1:
14712   case MVT::i8:
14713     // Scale == 1;
14714     break;
14715   case MVT::i16:
14716     // Scale == 2;
14717     Scale = 2;
14718     break;
14719   default:
14720     // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
14721     // Scale == 4;
14722     Scale = 4;
14723     break;
14724   }
14725
14726   if ((V & (Scale - 1)) != 0)
14727     return false;
14728   return isUInt<5>(V / Scale);
14729 }
14730
14731 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
14732                                       const ARMSubtarget *Subtarget) {
14733   if (!VT.isInteger() && !VT.isFloatingPoint())
14734     return false;
14735   if (VT.isVector() && Subtarget->hasNEON())
14736     return false;
14737   if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
14738       !Subtarget->hasMVEFloatOps())
14739     return false;
14740
14741   bool IsNeg = false;
14742   if (V < 0) {
14743     IsNeg = true;
14744     V = -V;
14745   }
14746
14747   unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U);
14748
14749   // MVE: size * imm7
14750   if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
14751     switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
14752     case MVT::i32:
14753     case MVT::f32:
14754       return isShiftedUInt<7,2>(V);
14755     case MVT::i16:
14756     case MVT::f16:
14757       return isShiftedUInt<7,1>(V);
14758     case MVT::i8:
14759       return isUInt<7>(V);
14760     default:
14761       return false;
14762     }
14763   }
14764
14765   // half VLDR: 2 * imm8
14766   if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
14767     return isShiftedUInt<8, 1>(V);
14768   // VLDR and LDRD: 4 * imm8
14769   if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
14770     return isShiftedUInt<8, 2>(V);
14771
14772   if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
14773     // + imm12 or - imm8
14774     if (IsNeg)
14775       return isUInt<8>(V);
14776     return isUInt<12>(V);
14777   }
14778
14779   return false;
14780 }
14781
14782 /// isLegalAddressImmediate - Return true if the integer value can be used
14783 /// as the offset of the target addressing mode for load / store of the
14784 /// given type.
14785 static bool isLegalAddressImmediate(int64_t V, EVT VT,
14786                                     const ARMSubtarget *Subtarget) {
14787   if (V == 0)
14788     return true;
14789
14790   if (!VT.isSimple())
14791     return false;
14792
14793   if (Subtarget->isThumb1Only())
14794     return isLegalT1AddressImmediate(V, VT);
14795   else if (Subtarget->isThumb2())
14796     return isLegalT2AddressImmediate(V, VT, Subtarget);
14797
14798   // ARM mode.
14799   if (V < 0)
14800     V = - V;
14801   switch (VT.getSimpleVT().SimpleTy) {
14802   default: return false;
14803   case MVT::i1:
14804   case MVT::i8:
14805   case MVT::i32:
14806     // +- imm12
14807     return isUInt<12>(V);
14808   case MVT::i16:
14809     // +- imm8
14810     return isUInt<8>(V);
14811   case MVT::f32:
14812   case MVT::f64:
14813     if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
14814       return false;
14815     return isShiftedUInt<8, 2>(V);
14816   }
14817 }
14818
14819 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
14820                                                       EVT VT) const {
14821   int Scale = AM.Scale;
14822   if (Scale < 0)
14823     return false;
14824
14825   switch (VT.getSimpleVT().SimpleTy) {
14826   default: return false;
14827   case MVT::i1:
14828   case MVT::i8:
14829   case MVT::i16:
14830   case MVT::i32:
14831     if (Scale == 1)
14832       return true;
14833     // r + r << imm
14834     Scale = Scale & ~1;
14835     return Scale == 2 || Scale == 4 || Scale == 8;
14836   case MVT::i64:
14837     // FIXME: What are we trying to model here? ldrd doesn't have an r + r
14838     // version in Thumb mode.
14839     // r + r
14840     if (Scale == 1)
14841       return true;
14842     // r * 2 (this can be lowered to r + r).
14843     if (!AM.HasBaseReg && Scale == 2)
14844       return true;
14845     return false;
14846   case MVT::isVoid:
14847     // Note, we allow "void" uses (basically, uses that aren't loads or
14848     // stores), because arm allows folding a scale into many arithmetic
14849     // operations.  This should be made more precise and revisited later.
14850
14851     // Allow r << imm, but the imm has to be a multiple of two.
14852     if (Scale & 1) return false;
14853     return isPowerOf2_32(Scale);
14854   }
14855 }
14856
14857 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
14858                                                       EVT VT) const {
14859   const int Scale = AM.Scale;
14860
14861   // Negative scales are not supported in Thumb1.
14862   if (Scale < 0)
14863     return false;
14864
14865   // Thumb1 addressing modes do not support register scaling excepting the
14866   // following cases:
14867   // 1. Scale == 1 means no scaling.
14868   // 2. Scale == 2 this can be lowered to r + r if there is no base register.
14869   return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
14870 }
14871
14872 /// isLegalAddressingMode - Return true if the addressing mode represented
14873 /// by AM is legal for this target, for a load/store of the specified type.
14874 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
14875                                               const AddrMode &AM, Type *Ty,
14876                                               unsigned AS, Instruction *I) const {
14877   EVT VT = getValueType(DL, Ty, true);
14878   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
14879     return false;
14880
14881   // Can never fold addr of global into load/store.
14882   if (AM.BaseGV)
14883     return false;
14884
14885   switch (AM.Scale) {
14886   case 0:  // no scale reg, must be "r+i" or "r", or "i".
14887     break;
14888   default:
14889     // ARM doesn't support any R+R*scale+imm addr modes.
14890     if (AM.BaseOffs)
14891       return false;
14892
14893     if (!VT.isSimple())
14894       return false;
14895
14896     if (Subtarget->isThumb1Only())
14897       return isLegalT1ScaledAddressingMode(AM, VT);
14898
14899     if (Subtarget->isThumb2())
14900       return isLegalT2ScaledAddressingMode(AM, VT);
14901
14902     int Scale = AM.Scale;
14903     switch (VT.getSimpleVT().SimpleTy) {
14904     default: return false;
14905     case MVT::i1:
14906     case MVT::i8:
14907     case MVT::i32:
14908       if (Scale < 0) Scale = -Scale;
14909       if (Scale == 1)
14910         return true;
14911       // r + r << imm
14912       return isPowerOf2_32(Scale & ~1);
14913     case MVT::i16:
14914     case MVT::i64:
14915       // r +/- r
14916       if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
14917         return true;
14918       // r * 2 (this can be lowered to r + r).
14919       if (!AM.HasBaseReg && Scale == 2)
14920         return true;
14921       return false;
14922
14923     case MVT::isVoid:
14924       // Note, we allow "void" uses (basically, uses that aren't loads or
14925       // stores), because arm allows folding a scale into many arithmetic
14926       // operations.  This should be made more precise and revisited later.
14927
14928       // Allow r << imm, but the imm has to be a multiple of two.
14929       if (Scale & 1) return false;
14930       return isPowerOf2_32(Scale);
14931     }
14932   }
14933   return true;
14934 }
14935
14936 /// isLegalICmpImmediate - Return true if the specified immediate is legal
14937 /// icmp immediate, that is the target has icmp instructions which can compare
14938 /// a register against the immediate without having to materialize the
14939 /// immediate into a register.
14940 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
14941   // Thumb2 and ARM modes can use cmn for negative immediates.
14942   if (!Subtarget->isThumb())
14943     return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
14944            ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
14945   if (Subtarget->isThumb2())
14946     return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
14947            ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
14948   // Thumb1 doesn't have cmn, and only 8-bit immediates.
14949   return Imm >= 0 && Imm <= 255;
14950 }
14951
14952 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
14953 /// *or sub* immediate, that is the target has add or sub instructions which can
14954 /// add a register with the immediate without having to materialize the
14955 /// immediate into a register.
14956 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
14957   // Same encoding for add/sub, just flip the sign.
14958   int64_t AbsImm = std::abs(Imm);
14959   if (!Subtarget->isThumb())
14960     return ARM_AM::getSOImmVal(AbsImm) != -1;
14961   if (Subtarget->isThumb2())
14962     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
14963   // Thumb1 only has 8-bit unsigned immediate.
14964   return AbsImm >= 0 && AbsImm <= 255;
14965 }
14966
14967 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
14968                                       bool isSEXTLoad, SDValue &Base,
14969                                       SDValue &Offset, bool &isInc,
14970                                       SelectionDAG &DAG) {
14971   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
14972     return false;
14973
14974   if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
14975     // AddressingMode 3
14976     Base = Ptr->getOperand(0);
14977     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
14978       int RHSC = (int)RHS->getZExtValue();
14979       if (RHSC < 0 && RHSC > -256) {
14980         assert(Ptr->getOpcode() == ISD::ADD);
14981         isInc = false;
14982         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
14983         return true;
14984       }
14985     }
14986     isInc = (Ptr->getOpcode() == ISD::ADD);
14987     Offset = Ptr->getOperand(1);
14988     return true;
14989   } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
14990     // AddressingMode 2
14991     if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
14992       int RHSC = (int)RHS->getZExtValue();
14993       if (RHSC < 0 && RHSC > -0x1000) {
14994         assert(Ptr->getOpcode() == ISD::ADD);
14995         isInc = false;
14996         Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
14997         Base = Ptr->getOperand(0);
14998         return true;
14999       }
15000     }
15001
15002     if (Ptr->getOpcode() == ISD::ADD) {
15003       isInc = true;
15004       ARM_AM::ShiftOpc ShOpcVal=
15005         ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
15006       if (ShOpcVal != ARM_AM::no_shift) {
15007         Base = Ptr->getOperand(1);
15008         Offset = Ptr->getOperand(0);
15009       } else {
15010         Base = Ptr->getOperand(0);
15011         Offset = Ptr->getOperand(1);
15012       }
15013       return true;
15014     }
15015
15016     isInc = (Ptr->getOpcode() == ISD::ADD);
15017     Base = Ptr->getOperand(0);
15018     Offset = Ptr->getOperand(1);
15019     return true;
15020   }
15021
15022   // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
15023   return false;
15024 }
15025
15026 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
15027                                      bool isSEXTLoad, SDValue &Base,
15028                                      SDValue &Offset, bool &isInc,
15029                                      SelectionDAG &DAG) {
15030   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
15031     return false;
15032
15033   Base = Ptr->getOperand(0);
15034   if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
15035     int RHSC = (int)RHS->getZExtValue();
15036     if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
15037       assert(Ptr->getOpcode() == ISD::ADD);
15038       isInc = false;
15039       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
15040       return true;
15041     } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
15042       isInc = Ptr->getOpcode() == ISD::ADD;
15043       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
15044       return true;
15045     }
15046   }
15047
15048   return false;
15049 }
15050
15051 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
15052                                       bool isSEXTLoad, bool isLE, SDValue &Base,
15053                                       SDValue &Offset, bool &isInc,
15054                                       SelectionDAG &DAG) {
15055   if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
15056     return false;
15057   if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
15058     return false;
15059
15060   ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
15061   int RHSC = (int)RHS->getZExtValue();
15062
15063   auto IsInRange = [&](int RHSC, int Limit, int Scale) {
15064     if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
15065       assert(Ptr->getOpcode() == ISD::ADD);
15066       isInc = false;
15067       Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
15068       return true;
15069     } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
15070       isInc = Ptr->getOpcode() == ISD::ADD;
15071       Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
15072       return true;
15073     }
15074     return false;
15075   };
15076
15077   // Try to find a matching instruction based on s/zext, Alignment, Offset and
15078   // (in BE) type.
15079   Base = Ptr->getOperand(0);
15080   if (VT == MVT::v4i16) {
15081     if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
15082       return true;
15083   } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
15084     if (IsInRange(RHSC, 0x80, 1))
15085       return true;
15086   } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) &&
15087              IsInRange(RHSC, 0x80, 4))
15088     return true;
15089   else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) &&
15090            IsInRange(RHSC, 0x80, 2))
15091     return true;
15092   else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
15093     return true;
15094   return false;
15095 }
15096
15097 /// getPreIndexedAddressParts - returns true by value, base pointer and
15098 /// offset pointer and addressing mode by reference if the node's address
15099 /// can be legally represented as pre-indexed load / store address.
15100 bool
15101 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
15102                                              SDValue &Offset,
15103                                              ISD::MemIndexedMode &AM,
15104                                              SelectionDAG &DAG) const {
15105   if (Subtarget->isThumb1Only())
15106     return false;
15107
15108   EVT VT;
15109   SDValue Ptr;
15110   unsigned Align;
15111   bool isSEXTLoad = false;
15112   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
15113     Ptr = LD->getBasePtr();
15114     VT = LD->getMemoryVT();
15115     Align = LD->getAlignment();
15116     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
15117   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
15118     Ptr = ST->getBasePtr();
15119     VT = ST->getMemoryVT();
15120     Align = ST->getAlignment();
15121   } else
15122     return false;
15123
15124   bool isInc;
15125   bool isLegal = false;
15126   if (VT.isVector())
15127     isLegal = Subtarget->hasMVEIntegerOps() &&
15128               getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
15129                                         Subtarget->isLittle(), Base, Offset,
15130                                         isInc, DAG);
15131   else {
15132     if (Subtarget->isThumb2())
15133       isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
15134                                          Offset, isInc, DAG);
15135     else
15136       isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
15137                                           Offset, isInc, DAG);
15138   }
15139   if (!isLegal)
15140     return false;
15141
15142   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
15143   return true;
15144 }
15145
15146 /// getPostIndexedAddressParts - returns true by value, base pointer and
15147 /// offset pointer and addressing mode by reference if this node can be
15148 /// combined with a load / store to form a post-indexed load / store.
15149 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
15150                                                    SDValue &Base,
15151                                                    SDValue &Offset,
15152                                                    ISD::MemIndexedMode &AM,
15153                                                    SelectionDAG &DAG) const {
15154   EVT VT;
15155   SDValue Ptr;
15156   unsigned Align;
15157   bool isSEXTLoad = false, isNonExt;
15158   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
15159     VT = LD->getMemoryVT();
15160     Ptr = LD->getBasePtr();
15161     Align = LD->getAlignment();
15162     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
15163     isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
15164   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
15165     VT = ST->getMemoryVT();
15166     Ptr = ST->getBasePtr();
15167     Align = ST->getAlignment();
15168     isNonExt = !ST->isTruncatingStore();
15169   } else
15170     return false;
15171
15172   if (Subtarget->isThumb1Only()) {
15173     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
15174     // must be non-extending/truncating, i32, with an offset of 4.
15175     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
15176     if (Op->getOpcode() != ISD::ADD || !isNonExt)
15177       return false;
15178     auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
15179     if (!RHS || RHS->getZExtValue() != 4)
15180       return false;
15181
15182     Offset = Op->getOperand(1);
15183     Base = Op->getOperand(0);
15184     AM = ISD::POST_INC;
15185     return true;
15186   }
15187
15188   bool isInc;
15189   bool isLegal = false;
15190   if (VT.isVector())
15191     isLegal = Subtarget->hasMVEIntegerOps() &&
15192               getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad,
15193                                         Subtarget->isLittle(), Base, Offset,
15194                                         isInc, DAG);
15195   else {
15196     if (Subtarget->isThumb2())
15197       isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
15198                                          isInc, DAG);
15199     else
15200       isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
15201                                           isInc, DAG);
15202   }
15203   if (!isLegal)
15204     return false;
15205
15206   if (Ptr != Base) {
15207     // Swap base ptr and offset to catch more post-index load / store when
15208     // it's legal. In Thumb2 mode, offset must be an immediate.
15209     if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
15210         !Subtarget->isThumb2())
15211       std::swap(Base, Offset);
15212
15213     // Post-indexed load / store update the base pointer.
15214     if (Ptr != Base)
15215       return false;
15216   }
15217
15218   AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
15219   return true;
15220 }
15221
15222 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15223                                                       KnownBits &Known,
15224                                                       const APInt &DemandedElts,
15225                                                       const SelectionDAG &DAG,
15226                                                       unsigned Depth) const {
15227   unsigned BitWidth = Known.getBitWidth();
15228   Known.resetAll();
15229   switch (Op.getOpcode()) {
15230   default: break;
15231   case ARMISD::ADDC:
15232   case ARMISD::ADDE:
15233   case ARMISD::SUBC:
15234   case ARMISD::SUBE:
15235     // Special cases when we convert a carry to a boolean.
15236     if (Op.getResNo() == 0) {
15237       SDValue LHS = Op.getOperand(0);
15238       SDValue RHS = Op.getOperand(1);
15239       // (ADDE 0, 0, C) will give us a single bit.
15240       if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
15241           isNullConstant(RHS)) {
15242         Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
15243         return;
15244       }
15245     }
15246     break;
15247   case ARMISD::CMOV: {
15248     // Bits are known zero/one if known on the LHS and RHS.
15249     Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
15250     if (Known.isUnknown())
15251       return;
15252
15253     KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
15254     Known.Zero &= KnownRHS.Zero;
15255     Known.One  &= KnownRHS.One;
15256     return;
15257   }
15258   case ISD::INTRINSIC_W_CHAIN: {
15259     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
15260     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
15261     switch (IntID) {
15262     default: return;
15263     case Intrinsic::arm_ldaex:
15264     case Intrinsic::arm_ldrex: {
15265       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
15266       unsigned MemBits = VT.getScalarSizeInBits();
15267       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
15268       return;
15269     }
15270     }
15271   }
15272   case ARMISD::BFI: {
15273     // Conservatively, we can recurse down the first operand
15274     // and just mask out all affected bits.
15275     Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
15276
15277     // The operand to BFI is already a mask suitable for removing the bits it
15278     // sets.
15279     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
15280     const APInt &Mask = CI->getAPIntValue();
15281     Known.Zero &= Mask;
15282     Known.One &= Mask;
15283     return;
15284   }
15285   case ARMISD::VGETLANEs:
15286   case ARMISD::VGETLANEu: {
15287     const SDValue &SrcSV = Op.getOperand(0);
15288     EVT VecVT = SrcSV.getValueType();
15289     assert(VecVT.isVector() && "VGETLANE expected a vector type");
15290     const unsigned NumSrcElts = VecVT.getVectorNumElements();
15291     ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
15292     assert(Pos->getAPIntValue().ult(NumSrcElts) &&
15293            "VGETLANE index out of bounds");
15294     unsigned Idx = Pos->getZExtValue();
15295     APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
15296     Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
15297
15298     EVT VT = Op.getValueType();
15299     const unsigned DstSz = VT.getScalarSizeInBits();
15300     const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
15301     (void)SrcSz;
15302     assert(SrcSz == Known.getBitWidth());
15303     assert(DstSz > SrcSz);
15304     if (Op.getOpcode() == ARMISD::VGETLANEs)
15305       Known = Known.sext(DstSz);
15306     else {
15307       Known = Known.zext(DstSz, true /* extended bits are known zero */);
15308     }
15309     assert(DstSz == Known.getBitWidth());
15310     break;
15311   }
15312   }
15313 }
15314
15315 bool
15316 ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
15317                                                 const APInt &DemandedAPInt,
15318                                                 TargetLoweringOpt &TLO) const {
15319   // Delay optimization, so we don't have to deal with illegal types, or block
15320   // optimizations.
15321   if (!TLO.LegalOps)
15322     return false;
15323
15324   // Only optimize AND for now.
15325   if (Op.getOpcode() != ISD::AND)
15326     return false;
15327
15328   EVT VT = Op.getValueType();
15329
15330   // Ignore vectors.
15331   if (VT.isVector())
15332     return false;
15333
15334   assert(VT == MVT::i32 && "Unexpected integer type");
15335
15336   // Make sure the RHS really is a constant.
15337   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15338   if (!C)
15339     return false;
15340
15341   unsigned Mask = C->getZExtValue();
15342
15343   unsigned Demanded = DemandedAPInt.getZExtValue();
15344   unsigned ShrunkMask = Mask & Demanded;
15345   unsigned ExpandedMask = Mask | ~Demanded;
15346
15347   // If the mask is all zeros, let the target-independent code replace the
15348   // result with zero.
15349   if (ShrunkMask == 0)
15350     return false;
15351
15352   // If the mask is all ones, erase the AND. (Currently, the target-independent
15353   // code won't do this, so we have to do it explicitly to avoid an infinite
15354   // loop in obscure cases.)
15355   if (ExpandedMask == ~0U)
15356     return TLO.CombineTo(Op, Op.getOperand(0));
15357
15358   auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
15359     return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
15360   };
15361   auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
15362     if (NewMask == Mask)
15363       return true;
15364     SDLoc DL(Op);
15365     SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
15366     SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
15367     return TLO.CombineTo(Op, NewOp);
15368   };
15369
15370   // Prefer uxtb mask.
15371   if (IsLegalMask(0xFF))
15372     return UseMask(0xFF);
15373
15374   // Prefer uxth mask.
15375   if (IsLegalMask(0xFFFF))
15376     return UseMask(0xFFFF);
15377
15378   // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
15379   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
15380   if (ShrunkMask < 256)
15381     return UseMask(ShrunkMask);
15382
15383   // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
15384   // FIXME: Prefer a contiguous sequence of bits for other optimizations.
15385   if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
15386     return UseMask(ExpandedMask);
15387
15388   // Potential improvements:
15389   //
15390   // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
15391   // We could try to prefer Thumb1 immediates which can be lowered to a
15392   // two-instruction sequence.
15393   // We could try to recognize more legal ARM/Thumb2 immediates here.
15394
15395   return false;
15396 }
15397
15398
15399 //===----------------------------------------------------------------------===//
15400 //                           ARM Inline Assembly Support
15401 //===----------------------------------------------------------------------===//
15402
15403 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
15404   // Looking for "rev" which is V6+.
15405   if (!Subtarget->hasV6Ops())
15406     return false;
15407
15408   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
15409   std::string AsmStr = IA->getAsmString();
15410   SmallVector<StringRef, 4> AsmPieces;
15411   SplitString(AsmStr, AsmPieces, ";\n");
15412
15413   switch (AsmPieces.size()) {
15414   default: return false;
15415   case 1:
15416     AsmStr = AsmPieces[0];
15417     AsmPieces.clear();
15418     SplitString(AsmStr, AsmPieces, " \t,");
15419
15420     // rev $0, $1
15421     if (AsmPieces.size() == 3 &&
15422         AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
15423         IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
15424       IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
15425       if (Ty && Ty->getBitWidth() == 32)
15426         return IntrinsicLowering::LowerToByteSwap(CI);
15427     }
15428     break;
15429   }
15430
15431   return false;
15432 }
15433
15434 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
15435   // At this point, we have to lower this constraint to something else, so we
15436   // lower it to an "r" or "w". However, by doing this we will force the result
15437   // to be in register, while the X constraint is much more permissive.
15438   //
15439   // Although we are correct (we are free to emit anything, without
15440   // constraints), we might break use cases that would expect us to be more
15441   // efficient and emit something else.
15442   if (!Subtarget->hasVFP2Base())
15443     return "r";
15444   if (ConstraintVT.isFloatingPoint())
15445     return "w";
15446   if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
15447      (ConstraintVT.getSizeInBits() == 64 ||
15448       ConstraintVT.getSizeInBits() == 128))
15449     return "w";
15450
15451   return "r";
15452 }
15453
15454 /// getConstraintType - Given a constraint letter, return the type of
15455 /// constraint it is for this target.
15456 ARMTargetLowering::ConstraintType
15457 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
15458   unsigned S = Constraint.size();
15459   if (S == 1) {
15460     switch (Constraint[0]) {
15461     default:  break;
15462     case 'l': return C_RegisterClass;
15463     case 'w': return C_RegisterClass;
15464     case 'h': return C_RegisterClass;
15465     case 'x': return C_RegisterClass;
15466     case 't': return C_RegisterClass;
15467     case 'j': return C_Immediate; // Constant for movw.
15468     // An address with a single base register. Due to the way we
15469     // currently handle addresses it is the same as an 'r' memory constraint.
15470     case 'Q': return C_Memory;
15471     }
15472   } else if (S == 2) {
15473     switch (Constraint[0]) {
15474     default: break;
15475     case 'T': return C_RegisterClass;
15476     // All 'U+' constraints are addresses.
15477     case 'U': return C_Memory;
15478     }
15479   }
15480   return TargetLowering::getConstraintType(Constraint);
15481 }
15482
15483 /// Examine constraint type and operand type and determine a weight value.
15484 /// This object must already have been set up with the operand type
15485 /// and the current alternative constraint selected.
15486 TargetLowering::ConstraintWeight
15487 ARMTargetLowering::getSingleConstraintMatchWeight(
15488     AsmOperandInfo &info, const char *constraint) const {
15489   ConstraintWeight weight = CW_Invalid;
15490   Value *CallOperandVal = info.CallOperandVal;
15491     // If we don't have a value, we can't do a match,
15492     // but allow it at the lowest weight.
15493   if (!CallOperandVal)
15494     return CW_Default;
15495   Type *type = CallOperandVal->getType();
15496   // Look at the constraint type.
15497   switch (*constraint) {
15498   default:
15499     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
15500     break;
15501   case 'l':
15502     if (type->isIntegerTy()) {
15503       if (Subtarget->isThumb())
15504         weight = CW_SpecificReg;
15505       else
15506         weight = CW_Register;
15507     }
15508     break;
15509   case 'w':
15510     if (type->isFloatingPointTy())
15511       weight = CW_Register;
15512     break;
15513   }
15514   return weight;
15515 }
15516
15517 using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
15518
15519 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
15520     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
15521   switch (Constraint.size()) {
15522   case 1:
15523     // GCC ARM Constraint Letters
15524     switch (Constraint[0]) {
15525     case 'l': // Low regs or general regs.
15526       if (Subtarget->isThumb())
15527         return RCPair(0U, &ARM::tGPRRegClass);
15528       return RCPair(0U, &ARM::GPRRegClass);
15529     case 'h': // High regs or no regs.
15530       if (Subtarget->isThumb())
15531         return RCPair(0U, &ARM::hGPRRegClass);
15532       break;
15533     case 'r':
15534       if (Subtarget->isThumb1Only())
15535         return RCPair(0U, &ARM::tGPRRegClass);
15536       return RCPair(0U, &ARM::GPRRegClass);
15537     case 'w':
15538       if (VT == MVT::Other)
15539         break;
15540       if (VT == MVT::f32)
15541         return RCPair(0U, &ARM::SPRRegClass);
15542       if (VT.getSizeInBits() == 64)
15543         return RCPair(0U, &ARM::DPRRegClass);
15544       if (VT.getSizeInBits() == 128)
15545         return RCPair(0U, &ARM::QPRRegClass);
15546       break;
15547     case 'x':
15548       if (VT == MVT::Other)
15549         break;
15550       if (VT == MVT::f32)
15551         return RCPair(0U, &ARM::SPR_8RegClass);
15552       if (VT.getSizeInBits() == 64)
15553         return RCPair(0U, &ARM::DPR_8RegClass);
15554       if (VT.getSizeInBits() == 128)
15555         return RCPair(0U, &ARM::QPR_8RegClass);
15556       break;
15557     case 't':
15558       if (VT == MVT::Other)
15559         break;
15560       if (VT == MVT::f32 || VT == MVT::i32)
15561         return RCPair(0U, &ARM::SPRRegClass);
15562       if (VT.getSizeInBits() == 64)
15563         return RCPair(0U, &ARM::DPR_VFP2RegClass);
15564       if (VT.getSizeInBits() == 128)
15565         return RCPair(0U, &ARM::QPR_VFP2RegClass);
15566       break;
15567     }
15568     break;
15569
15570   case 2:
15571     if (Constraint[0] == 'T') {
15572       switch (Constraint[1]) {
15573       default:
15574         break;
15575       case 'e':
15576         return RCPair(0U, &ARM::tGPREvenRegClass);
15577       case 'o':
15578         return RCPair(0U, &ARM::tGPROddRegClass);
15579       }
15580     }
15581     break;
15582
15583   default:
15584     break;
15585   }
15586
15587   if (StringRef("{cc}").equals_lower(Constraint))
15588     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
15589
15590   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15591 }
15592
15593 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
15594 /// vector.  If it is invalid, don't add anything to Ops.
15595 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15596                                                      std::string &Constraint,
15597                                                      std::vector<SDValue>&Ops,
15598                                                      SelectionDAG &DAG) const {
15599   SDValue Result;
15600
15601   // Currently only support length 1 constraints.
15602   if (Constraint.length() != 1) return;
15603
15604   char ConstraintLetter = Constraint[0];
15605   switch (ConstraintLetter) {
15606   default: break;
15607   case 'j':
15608   case 'I': case 'J': case 'K': case 'L':
15609   case 'M': case 'N': case 'O':
15610     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
15611     if (!C)
15612       return;
15613
15614     int64_t CVal64 = C->getSExtValue();
15615     int CVal = (int) CVal64;
15616     // None of these constraints allow values larger than 32 bits.  Check
15617     // that the value fits in an int.
15618     if (CVal != CVal64)
15619       return;
15620
15621     switch (ConstraintLetter) {
15622       case 'j':
15623         // Constant suitable for movw, must be between 0 and
15624         // 65535.
15625         if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
15626           if (CVal >= 0 && CVal <= 65535)
15627             break;
15628         return;
15629       case 'I':
15630         if (Subtarget->isThumb1Only()) {
15631           // This must be a constant between 0 and 255, for ADD
15632           // immediates.
15633           if (CVal >= 0 && CVal <= 255)
15634             break;
15635         } else if (Subtarget->isThumb2()) {
15636           // A constant that can be used as an immediate value in a
15637           // data-processing instruction.
15638           if (ARM_AM::getT2SOImmVal(CVal) != -1)
15639             break;
15640         } else {
15641           // A constant that can be used as an immediate value in a
15642           // data-processing instruction.
15643           if (ARM_AM::getSOImmVal(CVal) != -1)
15644             break;
15645         }
15646         return;
15647
15648       case 'J':
15649         if (Subtarget->isThumb1Only()) {
15650           // This must be a constant between -255 and -1, for negated ADD
15651           // immediates. This can be used in GCC with an "n" modifier that
15652           // prints the negated value, for use with SUB instructions. It is
15653           // not useful otherwise but is implemented for compatibility.
15654           if (CVal >= -255 && CVal <= -1)
15655             break;
15656         } else {
15657           // This must be a constant between -4095 and 4095. It is not clear
15658           // what this constraint is intended for. Implemented for
15659           // compatibility with GCC.
15660           if (CVal >= -4095 && CVal <= 4095)
15661             break;
15662         }
15663         return;
15664
15665       case 'K':
15666         if (Subtarget->isThumb1Only()) {
15667           // A 32-bit value where only one byte has a nonzero value. Exclude
15668           // zero to match GCC. This constraint is used by GCC internally for
15669           // constants that can be loaded with a move/shift combination.
15670           // It is not useful otherwise but is implemented for compatibility.
15671           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
15672             break;
15673         } else if (Subtarget->isThumb2()) {
15674           // A constant whose bitwise inverse can be used as an immediate
15675           // value in a data-processing instruction. This can be used in GCC
15676           // with a "B" modifier that prints the inverted value, for use with
15677           // BIC and MVN instructions. It is not useful otherwise but is
15678           // implemented for compatibility.
15679           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
15680             break;
15681         } else {
15682           // A constant whose bitwise inverse can be used as an immediate
15683           // value in a data-processing instruction. This can be used in GCC
15684           // with a "B" modifier that prints the inverted value, for use with
15685           // BIC and MVN instructions. It is not useful otherwise but is
15686           // implemented for compatibility.
15687           if (ARM_AM::getSOImmVal(~CVal) != -1)
15688             break;
15689         }
15690         return;
15691
15692       case 'L':
15693         if (Subtarget->isThumb1Only()) {
15694           // This must be a constant between -7 and 7,
15695           // for 3-operand ADD/SUB immediate instructions.
15696           if (CVal >= -7 && CVal < 7)
15697             break;
15698         } else if (Subtarget->isThumb2()) {
15699           // A constant whose negation can be used as an immediate value in a
15700           // data-processing instruction. This can be used in GCC with an "n"
15701           // modifier that prints the negated value, for use with SUB
15702           // instructions. It is not useful otherwise but is implemented for
15703           // compatibility.
15704           if (ARM_AM::getT2SOImmVal(-CVal) != -1)
15705             break;
15706         } else {
15707           // A constant whose negation can be used as an immediate value in a
15708           // data-processing instruction. This can be used in GCC with an "n"
15709           // modifier that prints the negated value, for use with SUB
15710           // instructions. It is not useful otherwise but is implemented for
15711           // compatibility.
15712           if (ARM_AM::getSOImmVal(-CVal) != -1)
15713             break;
15714         }
15715         return;
15716
15717       case 'M':
15718         if (Subtarget->isThumb1Only()) {
15719           // This must be a multiple of 4 between 0 and 1020, for
15720           // ADD sp + immediate.
15721           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
15722             break;
15723         } else {
15724           // A power of two or a constant between 0 and 32.  This is used in
15725           // GCC for the shift amount on shifted register operands, but it is
15726           // useful in general for any shift amounts.
15727           if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
15728             break;
15729         }
15730         return;
15731
15732       case 'N':
15733         if (Subtarget->isThumb1Only()) {
15734           // This must be a constant between 0 and 31, for shift amounts.
15735           if (CVal >= 0 && CVal <= 31)
15736             break;
15737         }
15738         return;
15739
15740       case 'O':
15741         if (Subtarget->isThumb1Only()) {
15742           // This must be a multiple of 4 between -508 and 508, for
15743           // ADD/SUB sp = sp + immediate.
15744           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
15745             break;
15746         }
15747         return;
15748     }
15749     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
15750     break;
15751   }
15752
15753   if (Result.getNode()) {
15754     Ops.push_back(Result);
15755     return;
15756   }
15757   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15758 }
15759
15760 static RTLIB::Libcall getDivRemLibcall(
15761     const SDNode *N, MVT::SimpleValueType SVT) {
15762   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
15763           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
15764          "Unhandled Opcode in getDivRemLibcall");
15765   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
15766                   N->getOpcode() == ISD::SREM;
15767   RTLIB::Libcall LC;
15768   switch (SVT) {
15769   default: llvm_unreachable("Unexpected request for libcall!");
15770   case MVT::i8:  LC = isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
15771   case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
15772   case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
15773   case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
15774   }
15775   return LC;
15776 }
15777
15778 static TargetLowering::ArgListTy getDivRemArgList(
15779     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
15780   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
15781           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
15782          "Unhandled Opcode in getDivRemArgList");
15783   bool isSigned = N->getOpcode() == ISD::SDIVREM ||
15784                   N->getOpcode() == ISD::SREM;
15785   TargetLowering::ArgListTy Args;
15786   TargetLowering::ArgListEntry Entry;
15787   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
15788     EVT ArgVT = N->getOperand(i).getValueType();
15789     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
15790     Entry.Node = N->getOperand(i);
15791     Entry.Ty = ArgTy;
15792     Entry.IsSExt = isSigned;
15793     Entry.IsZExt = !isSigned;
15794     Args.push_back(Entry);
15795   }
15796   if (Subtarget->isTargetWindows() && Args.size() >= 2)
15797     std::swap(Args[0], Args[1]);
15798   return Args;
15799 }
15800
15801 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
15802   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
15803           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
15804           Subtarget->isTargetWindows()) &&
15805          "Register-based DivRem lowering only");
15806   unsigned Opcode = Op->getOpcode();
15807   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
15808          "Invalid opcode for Div/Rem lowering");
15809   bool isSigned = (Opcode == ISD::SDIVREM);
15810   EVT VT = Op->getValueType(0);
15811   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
15812   SDLoc dl(Op);
15813
15814   // If the target has hardware divide, use divide + multiply + subtract:
15815   //     div = a / b
15816   //     rem = a - b * div
15817   //     return {div, rem}
15818   // This should be lowered into UDIV/SDIV + MLS later on.
15819   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
15820                                         : Subtarget->hasDivideInARMMode();
15821   if (hasDivide && Op->getValueType(0).isSimple() &&
15822       Op->getSimpleValueType(0) == MVT::i32) {
15823     unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
15824     const SDValue Dividend = Op->getOperand(0);
15825     const SDValue Divisor = Op->getOperand(1);
15826     SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
15827     SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
15828     SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
15829
15830     SDValue Values[2] = {Div, Rem};
15831     return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
15832   }
15833
15834   RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
15835                                        VT.getSimpleVT().SimpleTy);
15836   SDValue InChain = DAG.getEntryNode();
15837
15838   TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
15839                                                     DAG.getContext(),
15840                                                     Subtarget);
15841
15842   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
15843                                          getPointerTy(DAG.getDataLayout()));
15844
15845   Type *RetTy = StructType::get(Ty, Ty);
15846
15847   if (Subtarget->isTargetWindows())
15848     InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
15849
15850   TargetLowering::CallLoweringInfo CLI(DAG);
15851   CLI.setDebugLoc(dl).setChain(InChain)
15852     .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
15853     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
15854
15855   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
15856   return CallInfo.first;
15857 }
15858
15859 // Lowers REM using divmod helpers
15860 // see RTABI section 4.2/4.3
15861 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
15862   // Build return types (div and rem)
15863   std::vector<Type*> RetTyParams;
15864   Type *RetTyElement;
15865
15866   switch (N->getValueType(0).getSimpleVT().SimpleTy) {
15867   default: llvm_unreachable("Unexpected request for libcall!");
15868   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
15869   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
15870   case MVT::i32:  RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
15871   case MVT::i64:  RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
15872   }
15873
15874   RetTyParams.push_back(RetTyElement);
15875   RetTyParams.push_back(RetTyElement);
15876   ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
15877   Type *RetTy = StructType::get(*DAG.getContext(), ret);
15878
15879   RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
15880                                                              SimpleTy);
15881   SDValue InChain = DAG.getEntryNode();
15882   TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
15883                                                     Subtarget);
15884   bool isSigned = N->getOpcode() == ISD::SREM;
15885   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
15886                                          getPointerTy(DAG.getDataLayout()));
15887
15888   if (Subtarget->isTargetWindows())
15889     InChain = WinDBZCheckDenominator(DAG, N, InChain);
15890
15891   // Lower call
15892   CallLoweringInfo CLI(DAG);
15893   CLI.setChain(InChain)
15894      .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
15895      .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N));
15896   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
15897
15898   // Return second (rem) result operand (first contains div)
15899   SDNode *ResNode = CallResult.first.getNode();
15900   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
15901   return ResNode->getOperand(1);
15902 }
15903
15904 SDValue
15905 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
15906   assert(Subtarget->isTargetWindows() && "unsupported target platform");
15907   SDLoc DL(Op);
15908
15909   // Get the inputs.
15910   SDValue Chain = Op.getOperand(0);
15911   SDValue Size  = Op.getOperand(1);
15912
15913   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
15914           "no-stack-arg-probe")) {
15915     unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
15916     SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
15917     Chain = SP.getValue(1);
15918     SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
15919     if (Align)
15920       SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
15921                        DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
15922     Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
15923     SDValue Ops[2] = { SP, Chain };
15924     return DAG.getMergeValues(Ops, DL);
15925   }
15926
15927   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
15928                               DAG.getConstant(2, DL, MVT::i32));
15929
15930   SDValue Flag;
15931   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
15932   Flag = Chain.getValue(1);
15933
15934   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15935   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
15936
15937   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
15938   Chain = NewSP.getValue(1);
15939
15940   SDValue Ops[2] = { NewSP, Chain };
15941   return DAG.getMergeValues(Ops, DL);
15942 }
15943
15944 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
15945   SDValue SrcVal = Op.getOperand(0);
15946   const unsigned DstSz = Op.getValueType().getSizeInBits();
15947   const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
15948   assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
15949          "Unexpected type for custom-lowering FP_EXTEND");
15950
15951   assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
15952          "With both FP DP and 16, any FP conversion is legal!");
15953
15954   assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
15955          "With FP16, 16 to 32 conversion is legal!");
15956
15957   // Either we are converting from 16 -> 64, without FP16 and/or
15958   // FP.double-precision or without Armv8-fp. So we must do it in two
15959   // steps.
15960   // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
15961   // without FP16. So we must do a function call.
15962   SDLoc Loc(Op);
15963   RTLIB::Libcall LC;
15964   MakeLibCallOptions CallOptions;
15965   if (SrcSz == 16) {
15966     // Instruction from 16 -> 32
15967     if (Subtarget->hasFP16())
15968       SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal);
15969     // Lib call from 16 -> 32
15970     else {
15971       LC = RTLIB::getFPEXT(MVT::f16, MVT::f32);
15972       assert(LC != RTLIB::UNKNOWN_LIBCALL &&
15973              "Unexpected type for custom-lowering FP_EXTEND");
15974       SrcVal =
15975         makeLibCall(DAG, LC, MVT::f32, SrcVal, CallOptions, Loc).first;
15976     }
15977   }
15978
15979   if (DstSz != 64)
15980     return SrcVal;
15981   // For sure now SrcVal is 32 bits
15982   if (Subtarget->hasFP64()) // Instruction from 32 -> 64
15983     return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal);
15984
15985   LC = RTLIB::getFPEXT(MVT::f32, MVT::f64);
15986   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
15987          "Unexpected type for custom-lowering FP_EXTEND");
15988   return makeLibCall(DAG, LC, MVT::f64, SrcVal, CallOptions, Loc).first;
15989 }
15990
15991 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
15992   SDValue SrcVal = Op.getOperand(0);
15993   EVT SrcVT = SrcVal.getValueType();
15994   EVT DstVT = Op.getValueType();
15995   const unsigned DstSz = Op.getValueType().getSizeInBits();
15996   const unsigned SrcSz = SrcVT.getSizeInBits();
15997   (void)DstSz;
15998   assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
15999          "Unexpected type for custom-lowering FP_ROUND");
16000
16001   assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
16002          "With both FP DP and 16, any FP conversion is legal!");
16003
16004   SDLoc Loc(Op);
16005
16006   // Instruction from 32 -> 16 if hasFP16 is valid
16007   if (SrcSz == 32 && Subtarget->hasFP16())
16008     return Op;
16009
16010   // Lib call from 32 -> 16 / 64 -> [32, 16]
16011   RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
16012   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
16013          "Unexpected type for custom-lowering FP_ROUND");
16014   MakeLibCallOptions CallOptions;
16015   return makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, Loc).first;
16016 }
16017
16018 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
16019                                  SelectionDAG &DAG) const {
16020   assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS.");
16021   MVT HalfT = MVT::i32;
16022   SDLoc dl(N);
16023   SDValue Hi, Lo, Tmp;
16024
16025   if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) ||
16026       !isOperationLegalOrCustom(ISD::UADDO, HalfT))
16027     return ;
16028
16029   unsigned OpTypeBits = HalfT.getScalarSizeInBits();
16030   SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
16031
16032   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
16033                    DAG.getConstant(0, dl, HalfT));
16034   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
16035                    DAG.getConstant(1, dl, HalfT));
16036
16037   Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi,
16038                     DAG.getConstant(OpTypeBits - 1, dl,
16039                     getShiftAmountTy(HalfT, DAG.getDataLayout())));
16040   Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
16041   Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
16042                    SDValue(Lo.getNode(), 1));
16043   Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
16044   Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
16045
16046   Results.push_back(Lo);
16047   Results.push_back(Hi);
16048 }
16049
16050 bool
16051 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
16052   // The ARM target isn't yet aware of offsets.
16053   return false;
16054 }
16055
16056 bool ARM::isBitFieldInvertedMask(unsigned v) {
16057   if (v == 0xffffffff)
16058     return false;
16059
16060   // there can be 1's on either or both "outsides", all the "inside"
16061   // bits must be 0's
16062   return isShiftedMask_32(~v);
16063 }
16064
16065 /// isFPImmLegal - Returns true if the target can instruction select the
16066 /// specified FP immediate natively. If false, the legalizer will
16067 /// materialize the FP immediate as a load from a constant pool.
16068 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
16069                                      bool ForCodeSize) const {
16070   if (!Subtarget->hasVFP3Base())
16071     return false;
16072   if (VT == MVT::f16 && Subtarget->hasFullFP16())
16073     return ARM_AM::getFP16Imm(Imm) != -1;
16074   if (VT == MVT::f32)
16075     return ARM_AM::getFP32Imm(Imm) != -1;
16076   if (VT == MVT::f64 && Subtarget->hasFP64())
16077     return ARM_AM::getFP64Imm(Imm) != -1;
16078   return false;
16079 }
16080
16081 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16082 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
16083 /// specified in the intrinsic calls.
16084 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16085                                            const CallInst &I,
16086                                            MachineFunction &MF,
16087                                            unsigned Intrinsic) const {
16088   switch (Intrinsic) {
16089   case Intrinsic::arm_neon_vld1:
16090   case Intrinsic::arm_neon_vld2:
16091   case Intrinsic::arm_neon_vld3:
16092   case Intrinsic::arm_neon_vld4:
16093   case Intrinsic::arm_neon_vld2lane:
16094   case Intrinsic::arm_neon_vld3lane:
16095   case Intrinsic::arm_neon_vld4lane:
16096   case Intrinsic::arm_neon_vld2dup:
16097   case Intrinsic::arm_neon_vld3dup:
16098   case Intrinsic::arm_neon_vld4dup: {
16099     Info.opc = ISD::INTRINSIC_W_CHAIN;
16100     // Conservatively set memVT to the entire set of vectors loaded.
16101     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
16102     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16103     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16104     Info.ptrVal = I.getArgOperand(0);
16105     Info.offset = 0;
16106     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
16107     Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
16108     // volatile loads with NEON intrinsics not supported
16109     Info.flags = MachineMemOperand::MOLoad;
16110     return true;
16111   }
16112   case Intrinsic::arm_neon_vld1x2:
16113   case Intrinsic::arm_neon_vld1x3:
16114   case Intrinsic::arm_neon_vld1x4: {
16115     Info.opc = ISD::INTRINSIC_W_CHAIN;
16116     // Conservatively set memVT to the entire set of vectors loaded.
16117     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
16118     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
16119     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16120     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
16121     Info.offset = 0;
16122     Info.align.reset();
16123     // volatile loads with NEON intrinsics not supported
16124     Info.flags = MachineMemOperand::MOLoad;
16125     return true;
16126   }
16127   case Intrinsic::arm_neon_vst1:
16128   case Intrinsic::arm_neon_vst2:
16129   case Intrinsic::arm_neon_vst3:
16130   case Intrinsic::arm_neon_vst4:
16131   case Intrinsic::arm_neon_vst2lane:
16132   case Intrinsic::arm_neon_vst3lane:
16133   case Intrinsic::arm_neon_vst4lane: {
16134     Info.opc = ISD::INTRINSIC_VOID;
16135     // Conservatively set memVT to the entire set of vectors stored.
16136     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
16137     unsigned NumElts = 0;
16138     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
16139       Type *ArgTy = I.getArgOperand(ArgI)->getType();
16140       if (!ArgTy->isVectorTy())
16141         break;
16142       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16143     }
16144     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16145     Info.ptrVal = I.getArgOperand(0);
16146     Info.offset = 0;
16147     Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
16148     Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
16149     // volatile stores with NEON intrinsics not supported
16150     Info.flags = MachineMemOperand::MOStore;
16151     return true;
16152   }
16153   case Intrinsic::arm_neon_vst1x2:
16154   case Intrinsic::arm_neon_vst1x3:
16155   case Intrinsic::arm_neon_vst1x4: {
16156     Info.opc = ISD::INTRINSIC_VOID;
16157     // Conservatively set memVT to the entire set of vectors stored.
16158     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
16159     unsigned NumElts = 0;
16160     for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
16161       Type *ArgTy = I.getArgOperand(ArgI)->getType();
16162       if (!ArgTy->isVectorTy())
16163         break;
16164       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
16165     }
16166     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
16167     Info.ptrVal = I.getArgOperand(0);
16168     Info.offset = 0;
16169     Info.align.reset();
16170     // volatile stores with NEON intrinsics not supported
16171     Info.flags = MachineMemOperand::MOStore;
16172     return true;
16173   }
16174   case Intrinsic::arm_ldaex:
16175   case Intrinsic::arm_ldrex: {
16176     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
16177     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
16178     Info.opc = ISD::INTRINSIC_W_CHAIN;
16179     Info.memVT = MVT::getVT(PtrTy->getElementType());
16180     Info.ptrVal = I.getArgOperand(0);
16181     Info.offset = 0;
16182     Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
16183     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16184     return true;
16185   }
16186   case Intrinsic::arm_stlex:
16187   case Intrinsic::arm_strex: {
16188     auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
16189     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
16190     Info.opc = ISD::INTRINSIC_W_CHAIN;
16191     Info.memVT = MVT::getVT(PtrTy->getElementType());
16192     Info.ptrVal = I.getArgOperand(1);
16193     Info.offset = 0;
16194     Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
16195     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16196     return true;
16197   }
16198   case Intrinsic::arm_stlexd:
16199   case Intrinsic::arm_strexd:
16200     Info.opc = ISD::INTRINSIC_W_CHAIN;
16201     Info.memVT = MVT::i64;
16202     Info.ptrVal = I.getArgOperand(2);
16203     Info.offset = 0;
16204     Info.align = Align(8);
16205     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16206     return true;
16207
16208   case Intrinsic::arm_ldaexd:
16209   case Intrinsic::arm_ldrexd:
16210     Info.opc = ISD::INTRINSIC_W_CHAIN;
16211     Info.memVT = MVT::i64;
16212     Info.ptrVal = I.getArgOperand(0);
16213     Info.offset = 0;
16214     Info.align = Align(8);
16215     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16216     return true;
16217
16218   default:
16219     break;
16220   }
16221
16222   return false;
16223 }
16224
16225 /// Returns true if it is beneficial to convert a load of a constant
16226 /// to just the constant itself.
16227 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16228                                                           Type *Ty) const {
16229   assert(Ty->isIntegerTy());
16230
16231   unsigned Bits = Ty->getPrimitiveSizeInBits();
16232   if (Bits == 0 || Bits > 32)
16233     return false;
16234   return true;
16235 }
16236
16237 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16238                                                 unsigned Index) const {
16239   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
16240     return false;
16241
16242   return (Index == 0 || Index == ResVT.getVectorNumElements());
16243 }
16244
16245 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
16246                                         ARM_MB::MemBOpt Domain) const {
16247   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16248
16249   // First, if the target has no DMB, see what fallback we can use.
16250   if (!Subtarget->hasDataBarrier()) {
16251     // Some ARMv6 cpus can support data barriers with an mcr instruction.
16252     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
16253     // here.
16254     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
16255       Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
16256       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
16257                         Builder.getInt32(0), Builder.getInt32(7),
16258                         Builder.getInt32(10), Builder.getInt32(5)};
16259       return Builder.CreateCall(MCR, args);
16260     } else {
16261       // Instead of using barriers, atomic accesses on these subtargets use
16262       // libcalls.
16263       llvm_unreachable("makeDMB on a target so old that it has no barriers");
16264     }
16265   } else {
16266     Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
16267     // Only a full system barrier exists in the M-class architectures.
16268     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
16269     Constant *CDomain = Builder.getInt32(Domain);
16270     return Builder.CreateCall(DMB, CDomain);
16271   }
16272 }
16273
16274 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
16275 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
16276                                                  Instruction *Inst,
16277                                                  AtomicOrdering Ord) const {
16278   switch (Ord) {
16279   case AtomicOrdering::NotAtomic:
16280   case AtomicOrdering::Unordered:
16281     llvm_unreachable("Invalid fence: unordered/non-atomic");
16282   case AtomicOrdering::Monotonic:
16283   case AtomicOrdering::Acquire:
16284     return nullptr; // Nothing to do
16285   case AtomicOrdering::SequentiallyConsistent:
16286     if (!Inst->hasAtomicStore())
16287       return nullptr; // Nothing to do
16288     LLVM_FALLTHROUGH;
16289   case AtomicOrdering::Release:
16290   case AtomicOrdering::AcquireRelease:
16291     if (Subtarget->preferISHSTBarriers())
16292       return makeDMB(Builder, ARM_MB::ISHST);
16293     // FIXME: add a comment with a link to documentation justifying this.
16294     else
16295       return makeDMB(Builder, ARM_MB::ISH);
16296   }
16297   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
16298 }
16299
16300 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
16301                                                   Instruction *Inst,
16302                                                   AtomicOrdering Ord) const {
16303   switch (Ord) {
16304   case AtomicOrdering::NotAtomic:
16305   case AtomicOrdering::Unordered:
16306     llvm_unreachable("Invalid fence: unordered/not-atomic");
16307   case AtomicOrdering::Monotonic:
16308   case AtomicOrdering::Release:
16309     return nullptr; // Nothing to do
16310   case AtomicOrdering::Acquire:
16311   case AtomicOrdering::AcquireRelease:
16312   case AtomicOrdering::SequentiallyConsistent:
16313     return makeDMB(Builder, ARM_MB::ISH);
16314   }
16315   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
16316 }
16317
16318 // Loads and stores less than 64-bits are already atomic; ones above that
16319 // are doomed anyway, so defer to the default libcall and blame the OS when
16320 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
16321 // anything for those.
16322 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16323   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
16324   return (Size == 64) && !Subtarget->isMClass();
16325 }
16326
16327 // Loads and stores less than 64-bits are already atomic; ones above that
16328 // are doomed anyway, so defer to the default libcall and blame the OS when
16329 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
16330 // anything for those.
16331 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
16332 // guarantee, see DDI0406C ARM architecture reference manual,
16333 // sections A8.8.72-74 LDRD)
16334 TargetLowering::AtomicExpansionKind
16335 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16336   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
16337   return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly
16338                                                   : AtomicExpansionKind::None;
16339 }
16340
16341 // For the real atomic operations, we have ldrex/strex up to 32 bits,
16342 // and up to 64 bits on the non-M profiles
16343 TargetLowering::AtomicExpansionKind
16344 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
16345   if (AI->isFloatingPointOperation())
16346     return AtomicExpansionKind::CmpXChg;
16347
16348   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
16349   bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
16350   return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)
16351              ? AtomicExpansionKind::LLSC
16352              : AtomicExpansionKind::None;
16353 }
16354
16355 TargetLowering::AtomicExpansionKind
16356 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
16357   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
16358   // implement cmpxchg without spilling. If the address being exchanged is also
16359   // on the stack and close enough to the spill slot, this can lead to a
16360   // situation where the monitor always gets cleared and the atomic operation
16361   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
16362   bool HasAtomicCmpXchg =
16363       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
16364   if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg)
16365     return AtomicExpansionKind::LLSC;
16366   return AtomicExpansionKind::None;
16367 }
16368
16369 bool ARMTargetLowering::shouldInsertFencesForAtomic(
16370     const Instruction *I) const {
16371   return InsertFencesForAtomic;
16372 }
16373
16374 // This has so far only been implemented for MachO.
16375 bool ARMTargetLowering::useLoadStackGuardNode() const {
16376   return Subtarget->isTargetMachO();
16377 }
16378
16379 void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
16380   if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
16381     return TargetLowering::insertSSPDeclarations(M);
16382
16383   // MSVC CRT has a global variable holding security cookie.
16384   M.getOrInsertGlobal("__security_cookie",
16385                       Type::getInt8PtrTy(M.getContext()));
16386
16387   // MSVC CRT has a function to validate security cookie.
16388   FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
16389       "__security_check_cookie", Type::getVoidTy(M.getContext()),
16390       Type::getInt8PtrTy(M.getContext()));
16391   if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
16392     F->addAttribute(1, Attribute::AttrKind::InReg);
16393 }
16394
16395 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const {
16396   // MSVC CRT has a global variable holding security cookie.
16397   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
16398     return M.getGlobalVariable("__security_cookie");
16399   return TargetLowering::getSDagStackGuard(M);
16400 }
16401
16402 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const {
16403   // MSVC CRT has a function to validate security cookie.
16404   if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
16405     return M.getFunction("__security_check_cookie");
16406   return TargetLowering::getSSPStackGuardCheck(M);
16407 }
16408
16409 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
16410                                                   unsigned &Cost) const {
16411   // If we do not have NEON, vector types are not natively supported.
16412   if (!Subtarget->hasNEON())
16413     return false;
16414
16415   // Floating point values and vector values map to the same register file.
16416   // Therefore, although we could do a store extract of a vector type, this is
16417   // better to leave at float as we have more freedom in the addressing mode for
16418   // those.
16419   if (VectorTy->isFPOrFPVectorTy())
16420     return false;
16421
16422   // If the index is unknown at compile time, this is very expensive to lower
16423   // and it is not possible to combine the store with the extract.
16424   if (!isa<ConstantInt>(Idx))
16425     return false;
16426
16427   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
16428   unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
16429   // We can do a store + vector extract on any vector that fits perfectly in a D
16430   // or Q register.
16431   if (BitWidth == 64 || BitWidth == 128) {
16432     Cost = 0;
16433     return true;
16434   }
16435   return false;
16436 }
16437
16438 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
16439   return Subtarget->hasV6T2Ops();
16440 }
16441
16442 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
16443   return Subtarget->hasV6T2Ops();
16444 }
16445
16446 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
16447   return !Subtarget->hasMinSize();
16448 }
16449
16450 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
16451                                          AtomicOrdering Ord) const {
16452   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16453   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
16454   bool IsAcquire = isAcquireOrStronger(Ord);
16455
16456   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
16457   // intrinsic must return {i32, i32} and we have to recombine them into a
16458   // single i64 here.
16459   if (ValTy->getPrimitiveSizeInBits() == 64) {
16460     Intrinsic::ID Int =
16461         IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
16462     Function *Ldrex = Intrinsic::getDeclaration(M, Int);
16463
16464     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
16465     Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
16466
16467     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
16468     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
16469     if (!Subtarget->isLittle())
16470       std::swap (Lo, Hi);
16471     Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
16472     Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
16473     return Builder.CreateOr(
16474         Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
16475   }
16476
16477   Type *Tys[] = { Addr->getType() };
16478   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
16479   Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
16480
16481   return Builder.CreateTruncOrBitCast(
16482       Builder.CreateCall(Ldrex, Addr),
16483       cast<PointerType>(Addr->getType())->getElementType());
16484 }
16485
16486 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
16487     IRBuilder<> &Builder) const {
16488   if (!Subtarget->hasV7Ops())
16489     return;
16490   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16491   Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
16492 }
16493
16494 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
16495                                                Value *Addr,
16496                                                AtomicOrdering Ord) const {
16497   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
16498   bool IsRelease = isReleaseOrStronger(Ord);
16499
16500   // Since the intrinsics must have legal type, the i64 intrinsics take two
16501   // parameters: "i32, i32". We must marshal Val into the appropriate form
16502   // before the call.
16503   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
16504     Intrinsic::ID Int =
16505         IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
16506     Function *Strex = Intrinsic::getDeclaration(M, Int);
16507     Type *Int32Ty = Type::getInt32Ty(M->getContext());
16508
16509     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
16510     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
16511     if (!Subtarget->isLittle())
16512       std::swap(Lo, Hi);
16513     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
16514     return Builder.CreateCall(Strex, {Lo, Hi, Addr});
16515   }
16516
16517   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
16518   Type *Tys[] = { Addr->getType() };
16519   Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
16520
16521   return Builder.CreateCall(
16522       Strex, {Builder.CreateZExtOrBitCast(
16523                   Val, Strex->getFunctionType()->getParamType(0)),
16524               Addr});
16525 }
16526
16527
16528 bool ARMTargetLowering::alignLoopsWithOptSize() const {
16529   return Subtarget->isMClass();
16530 }
16531
16532 /// A helper function for determining the number of interleaved accesses we
16533 /// will generate when lowering accesses of the given type.
16534 unsigned
16535 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
16536                                              const DataLayout &DL) const {
16537   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
16538 }
16539
16540 bool ARMTargetLowering::isLegalInterleavedAccessType(
16541     VectorType *VecTy, const DataLayout &DL) const {
16542
16543   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16544   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16545
16546   // Ensure the vector doesn't have f16 elements. Even though we could do an
16547   // i16 vldN, we can't hold the f16 vectors and will end up converting via
16548   // f32.
16549   if (VecTy->getElementType()->isHalfTy())
16550     return false;
16551
16552   // Ensure the number of vector elements is greater than 1.
16553   if (VecTy->getNumElements() < 2)
16554     return false;
16555
16556   // Ensure the element type is legal.
16557   if (ElSize != 8 && ElSize != 16 && ElSize != 32)
16558     return false;
16559
16560   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16561   // 128 will be split into multiple interleaved accesses.
16562   return VecSize == 64 || VecSize % 128 == 0;
16563 }
16564
16565 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
16566   if (Subtarget->hasNEON())
16567     return 4;
16568   return TargetLoweringBase::getMaxSupportedInterleaveFactor();
16569 }
16570
16571 /// Lower an interleaved load into a vldN intrinsic.
16572 ///
16573 /// E.g. Lower an interleaved load (Factor = 2):
16574 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
16575 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
16576 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
16577 ///
16578 ///      Into:
16579 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
16580 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
16581 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
16582 bool ARMTargetLowering::lowerInterleavedLoad(
16583     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16584     ArrayRef<unsigned> Indices, unsigned Factor) const {
16585   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16586          "Invalid interleave factor");
16587   assert(!Shuffles.empty() && "Empty shufflevector input");
16588   assert(Shuffles.size() == Indices.size() &&
16589          "Unmatched number of shufflevectors and indices");
16590
16591   VectorType *VecTy = Shuffles[0]->getType();
16592   Type *EltTy = VecTy->getVectorElementType();
16593
16594   const DataLayout &DL = LI->getModule()->getDataLayout();
16595
16596   // Skip if we do not have NEON and skip illegal vector types. We can
16597   // "legalize" wide vector types into multiple interleaved accesses as long as
16598   // the vector types are divisible by 128.
16599   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
16600     return false;
16601
16602   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
16603
16604   // A pointer vector can not be the return type of the ldN intrinsics. Need to
16605   // load integer vectors first and then convert to pointer vectors.
16606   if (EltTy->isPointerTy())
16607     VecTy =
16608         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
16609
16610   IRBuilder<> Builder(LI);
16611
16612   // The base address of the load.
16613   Value *BaseAddr = LI->getPointerOperand();
16614
16615   if (NumLoads > 1) {
16616     // If we're going to generate more than one load, reset the sub-vector type
16617     // to something legal.
16618     VecTy = VectorType::get(VecTy->getVectorElementType(),
16619                             VecTy->getVectorNumElements() / NumLoads);
16620
16621     // We will compute the pointer operand of each load from the original base
16622     // address using GEPs. Cast the base address to a pointer to the scalar
16623     // element type.
16624     BaseAddr = Builder.CreateBitCast(
16625         BaseAddr, VecTy->getVectorElementType()->getPointerTo(
16626                       LI->getPointerAddressSpace()));
16627   }
16628
16629   assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
16630
16631   Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
16632   Type *Tys[] = {VecTy, Int8Ptr};
16633   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
16634                                             Intrinsic::arm_neon_vld3,
16635                                             Intrinsic::arm_neon_vld4};
16636   Function *VldnFunc =
16637       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
16638
16639   // Holds sub-vectors extracted from the load intrinsic return values. The
16640   // sub-vectors are associated with the shufflevector instructions they will
16641   // replace.
16642   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16643
16644   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16645     // If we're generating more than one load, compute the base address of
16646     // subsequent loads as an offset from the previous.
16647     if (LoadCount > 0)
16648       BaseAddr =
16649           Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
16650                                      VecTy->getVectorNumElements() * Factor);
16651
16652     SmallVector<Value *, 2> Ops;
16653     Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
16654     Ops.push_back(Builder.getInt32(LI->getAlignment()));
16655
16656     CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
16657
16658     // Replace uses of each shufflevector with the corresponding vector loaded
16659     // by ldN.
16660     for (unsigned i = 0; i < Shuffles.size(); i++) {
16661       ShuffleVectorInst *SV = Shuffles[i];
16662       unsigned Index = Indices[i];
16663
16664       Value *SubVec = Builder.CreateExtractValue(VldN, Index);
16665
16666       // Convert the integer vector to pointer vector if the element is pointer.
16667       if (EltTy->isPointerTy())
16668         SubVec = Builder.CreateIntToPtr(
16669             SubVec, VectorType::get(SV->getType()->getVectorElementType(),
16670                                     VecTy->getVectorNumElements()));
16671
16672       SubVecs[SV].push_back(SubVec);
16673     }
16674   }
16675
16676   // Replace uses of the shufflevector instructions with the sub-vectors
16677   // returned by the load intrinsic. If a shufflevector instruction is
16678   // associated with more than one sub-vector, those sub-vectors will be
16679   // concatenated into a single wide vector.
16680   for (ShuffleVectorInst *SVI : Shuffles) {
16681     auto &SubVec = SubVecs[SVI];
16682     auto *WideVec =
16683         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16684     SVI->replaceAllUsesWith(WideVec);
16685   }
16686
16687   return true;
16688 }
16689
16690 /// Lower an interleaved store into a vstN intrinsic.
16691 ///
16692 /// E.g. Lower an interleaved store (Factor = 3):
16693 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16694 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16695 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
16696 ///
16697 ///      Into:
16698 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16699 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16700 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16701 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
16702 ///
16703 /// Note that the new shufflevectors will be removed and we'll only generate one
16704 /// vst3 instruction in CodeGen.
16705 ///
16706 /// Example for a more general valid mask (Factor 3). Lower:
16707 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16708 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16709 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
16710 ///
16711 ///      Into:
16712 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16713 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16714 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16715 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
16716 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
16717                                               ShuffleVectorInst *SVI,
16718                                               unsigned Factor) const {
16719   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16720          "Invalid interleave factor");
16721
16722   VectorType *VecTy = SVI->getType();
16723   assert(VecTy->getVectorNumElements() % Factor == 0 &&
16724          "Invalid interleaved store");
16725
16726   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
16727   Type *EltTy = VecTy->getVectorElementType();
16728   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
16729
16730   const DataLayout &DL = SI->getModule()->getDataLayout();
16731
16732   // Skip if we do not have NEON and skip illegal vector types. We can
16733   // "legalize" wide vector types into multiple interleaved accesses as long as
16734   // the vector types are divisible by 128.
16735   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
16736     return false;
16737
16738   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
16739
16740   Value *Op0 = SVI->getOperand(0);
16741   Value *Op1 = SVI->getOperand(1);
16742   IRBuilder<> Builder(SI);
16743
16744   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16745   // vectors to integer vectors.
16746   if (EltTy->isPointerTy()) {
16747     Type *IntTy = DL.getIntPtrType(EltTy);
16748
16749     // Convert to the corresponding integer vector.
16750     Type *IntVecTy =
16751         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
16752     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16753     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16754
16755     SubVecTy = VectorType::get(IntTy, LaneLen);
16756   }
16757
16758   // The base address of the store.
16759   Value *BaseAddr = SI->getPointerOperand();
16760
16761   if (NumStores > 1) {
16762     // If we're going to generate more than one store, reset the lane length
16763     // and sub-vector type to something legal.
16764     LaneLen /= NumStores;
16765     SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
16766
16767     // We will compute the pointer operand of each store from the original base
16768     // address using GEPs. Cast the base address to a pointer to the scalar
16769     // element type.
16770     BaseAddr = Builder.CreateBitCast(
16771         BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
16772                       SI->getPointerAddressSpace()));
16773   }
16774
16775   assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
16776
16777   auto Mask = SVI->getShuffleMask();
16778
16779   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
16780   Type *Tys[] = {Int8Ptr, SubVecTy};
16781   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
16782                                              Intrinsic::arm_neon_vst3,
16783                                              Intrinsic::arm_neon_vst4};
16784
16785   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16786     // If we generating more than one store, we compute the base address of
16787     // subsequent stores as an offset from the previous.
16788     if (StoreCount > 0)
16789       BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
16790                                             BaseAddr, LaneLen * Factor);
16791
16792     SmallVector<Value *, 6> Ops;
16793     Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
16794
16795     Function *VstNFunc =
16796         Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
16797
16798     // Split the shufflevector operands into sub vectors for the new vstN call.
16799     for (unsigned i = 0; i < Factor; i++) {
16800       unsigned IdxI = StoreCount * LaneLen * Factor + i;
16801       if (Mask[IdxI] >= 0) {
16802         Ops.push_back(Builder.CreateShuffleVector(
16803             Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
16804       } else {
16805         unsigned StartMask = 0;
16806         for (unsigned j = 1; j < LaneLen; j++) {
16807           unsigned IdxJ = StoreCount * LaneLen * Factor + j;
16808           if (Mask[IdxJ * Factor + IdxI] >= 0) {
16809             StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
16810             break;
16811           }
16812         }
16813         // Note: If all elements in a chunk are undefs, StartMask=0!
16814         // Note: Filling undef gaps with random elements is ok, since
16815         // those elements were being written anyway (with undefs).
16816         // In the case of all undefs we're defaulting to using elems from 0
16817         // Note: StartMask cannot be negative, it's checked in
16818         // isReInterleaveMask
16819         Ops.push_back(Builder.CreateShuffleVector(
16820             Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
16821       }
16822     }
16823
16824     Ops.push_back(Builder.getInt32(SI->getAlignment()));
16825     Builder.CreateCall(VstNFunc, Ops);
16826   }
16827   return true;
16828 }
16829
/// Base-type classification tracked by isHomogeneousAggregate while it walks
/// an aggregate.
enum HABaseType {
  HA_UNKNOWN = 0, ///< No base type has been recorded yet.
  HA_FLOAT = 1,   ///< Recorded for float members.
  HA_DOUBLE = 2,  ///< Recorded for double members.
  HA_VECT64 = 3,  ///< Recorded for vectors whose bit width is 64.
  HA_VECT128 = 4  ///< Recorded for vectors whose bit width is 128.
};
16837
16838 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
16839                                    uint64_t &Members) {
16840   if (auto *ST = dyn_cast<StructType>(Ty)) {
16841     for (unsigned i = 0; i < ST->getNumElements(); ++i) {
16842       uint64_t SubMembers = 0;
16843       if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
16844         return false;
16845       Members += SubMembers;
16846     }
16847   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
16848     uint64_t SubMembers = 0;
16849     if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
16850       return false;
16851     Members += SubMembers * AT->getNumElements();
16852   } else if (Ty->isFloatTy()) {
16853     if (Base != HA_UNKNOWN && Base != HA_FLOAT)
16854       return false;
16855     Members = 1;
16856     Base = HA_FLOAT;
16857   } else if (Ty->isDoubleTy()) {
16858     if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
16859       return false;
16860     Members = 1;
16861     Base = HA_DOUBLE;
16862   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
16863     Members = 1;
16864     switch (Base) {
16865     case HA_FLOAT:
16866     case HA_DOUBLE:
16867       return false;
16868     case HA_VECT64:
16869       return VT->getBitWidth() == 64;
16870     case HA_VECT128:
16871       return VT->getBitWidth() == 128;
16872     case HA_UNKNOWN:
16873       switch (VT->getBitWidth()) {
16874       case 64:
16875         Base = HA_VECT64;
16876         return true;
16877       case 128:
16878         Base = HA_VECT128;
16879         return true;
16880       default:
16881         return false;
16882       }
16883     }
16884   }
16885
16886   return (Members > 0 && Members <= 4);
16887 }
16888
16889 /// Return the correct alignment for the current calling convention.
16890 unsigned
16891 ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
16892                                                  DataLayout DL) const {
16893   if (!ArgTy->isVectorTy())
16894     return DL.getABITypeAlignment(ArgTy);
16895
16896   // Avoid over-aligning vector parameters. It would require realigning the
16897   // stack and waste space for no real benefit.
16898   return std::min(DL.getABITypeAlignment(ArgTy),
16899                   (unsigned)DL.getStackAlignment().value());
16900 }
16901
16902 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
16903 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
16904 /// passing according to AAPCS rules.
16905 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
16906     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
16907   if (getEffectiveCallingConv(CallConv, isVarArg) !=
16908       CallingConv::ARM_AAPCS_VFP)
16909     return false;
16910
16911   HABaseType Base = HA_UNKNOWN;
16912   uint64_t Members = 0;
16913   bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
16914   LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
16915
16916   bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
16917   return IsHA || IsIntArray;
16918 }
16919
16920 unsigned ARMTargetLowering::getExceptionPointerRegister(
16921     const Constant *PersonalityFn) const {
16922   // Platforms which do not use SjLj EH may return values in these registers
16923   // via the personality function.
16924   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
16925 }
16926
16927 unsigned ARMTargetLowering::getExceptionSelectorRegister(
16928     const Constant *PersonalityFn) const {
16929   // Platforms which do not use SjLj EH may return values in these registers
16930   // via the personality function.
16931   return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
16932 }
16933
16934 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
16935   // Update IsSplitCSR in ARMFunctionInfo.
16936   ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
16937   AFI->setIsSplitCSR(true);
16938 }
16939
16940 void ARMTargetLowering::insertCopiesSplitCSR(
16941     MachineBasicBlock *Entry,
16942     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
16943   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
16944   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
16945   if (!IStart)
16946     return;
16947
16948   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
16949   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
16950   MachineBasicBlock::iterator MBBI = Entry->begin();
16951   for (const MCPhysReg *I = IStart; *I; ++I) {
16952     const TargetRegisterClass *RC = nullptr;
16953     if (ARM::GPRRegClass.contains(*I))
16954       RC = &ARM::GPRRegClass;
16955     else if (ARM::DPRRegClass.contains(*I))
16956       RC = &ARM::DPRRegClass;
16957     else
16958       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
16959
16960     Register NewVR = MRI->createVirtualRegister(RC);
16961     // Create copy from CSR to a virtual register.
16962     // FIXME: this currently does not emit CFI pseudo-instructions, it works
16963     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
16964     // nounwind. If we want to generalize this later, we may need to emit
16965     // CFI pseudo-instructions.
16966     assert(Entry->getParent()->getFunction().hasFnAttribute(
16967                Attribute::NoUnwind) &&
16968            "Function should be nounwind in insertCopiesSplitCSR!");
16969     Entry->addLiveIn(*I);
16970     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
16971         .addReg(*I);
16972
16973     // Insert the copy-back instructions right before the terminator.
16974     for (auto *Exit : Exits)
16975       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
16976               TII->get(TargetOpcode::COPY), *I)
16977           .addReg(NewVR);
16978   }
16979 }
16980
16981 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
16982   MF.getFrameInfo().computeMaxCallFrameSize(MF);
16983   TargetLoweringBase::finalizeLowering(MF);
16984 }